All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.maven.doxia.module.markdown.MarkdownParser Maven / Gradle / Ivy

There is a newer version: 2.0.0
Show newest version
package org.apache.maven.doxia.module.markdown;

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

import org.apache.commons.lang.StringEscapeUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.maven.doxia.markup.HtmlMarkup;
import org.apache.maven.doxia.module.xhtml.XhtmlParser;
import org.apache.maven.doxia.parser.AbstractParser;
import org.apache.maven.doxia.parser.ParseException;
import org.apache.maven.doxia.parser.Parser;
import org.apache.maven.doxia.sink.Sink;
import org.codehaus.plexus.component.annotations.Component;
import org.codehaus.plexus.component.annotations.Requirement;
import org.codehaus.plexus.util.IOUtil;
import org.codehaus.plexus.util.xml.pull.XmlPullParser;
import org.pegdown.Extensions;
import org.pegdown.PegDownProcessor;
import org.pegdown.ast.HeaderNode;
import org.pegdown.ast.HtmlBlockNode;
import org.pegdown.ast.Node;
import org.pegdown.ast.RootNode;
import org.pegdown.ast.SuperNode;
import org.pegdown.ast.TextNode;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Implementation of {@link org.apache.maven.doxia.parser.Parser} for Markdown documents.
 * 

* Defers effective parsing to the PegDown library, which generates HTML content * then delegates parsing of this content to a slightly modified Doxia Xhtml parser. * * @author Julien Nicoulaud * @since 1.3 * @see MarkdownToDoxiaHtmlSerializer */ @Component( role = Parser.class, hint = "markdown" ) public class MarkdownParser extends AbstractParser { /** * The role hint for the {@link MarkdownParser} Plexus component. */ public static final String ROLE_HINT = "markdown"; /** * The {@link PegDownProcessor} used to convert Pegdown documents to HTML. */ protected static final PegDownProcessor PEGDOWN_PROCESSOR = new PegDownProcessor( Extensions.ALL & ~Extensions.HARDWRAPS, Long.MAX_VALUE ); /** * Regex that identifies a multimarkdown-style metadata section at the start of the document */ private static final String MULTI_MARKDOWN_METADATA_SECTION = "^(((?:[^\\s:][^:]*):(?:.*(?:\r?\n\\p{Blank}+[^\\s].*)*\r?\n))+)(?:\\s*\r?\n)"; /** * Regex that captures the key and value of a multimarkdown-style metadata entry. */ private static final String MULTI_MARKDOWN_METADATA_ENTRY = "([^\\s:][^:]*):(.*(?:\r?\n\\p{Blank}+[^\\s].*)*)\r?\n"; /** * In order to ensure that we have minimal risk of false positives when slurping metadata sections, the * first key in the metadata section must be one of these standard keys or else the entire metadata section is * ignored. */ private static final String[] STANDARD_METADATA_KEYS = { "title", "author", "date", "address", "affiliation", "copyright", "email", "keywords", "language", "phone", "subtitle" }; public int getType() { return TXT_TYPE; } @Requirement private PegDownHtmlParser parser; public void parse( Reader source, Sink sink ) throws ParseException { try { // Markdown to HTML (using Pegdown library) String html = toHtml( source ); // then HTML to Sink API parser.parse( new StringReader( html ), sink ); } catch ( IOException e ) { throw new ParseException( "Failed reading Markdown source document", e ); } } /** * uses PegDown library to parse content and generate HTML output. * * @param source the Markdown source * @return HTML content generated by PegDown * @throws IOException * @see MarkdownToDoxiaHtmlSerializer */ private String toHtml( Reader source ) throws IOException { String text = IOUtil.toString( source ); StringBuilder html = new StringBuilder( text.length() * 2 ); html.append( "" ); html.append( "" ); Pattern metadataPattern = Pattern.compile( MULTI_MARKDOWN_METADATA_SECTION, Pattern.MULTILINE ); Matcher metadataMatcher = metadataPattern.matcher( text ); boolean haveTitle = false; if ( metadataMatcher.find() ) { metadataPattern = Pattern.compile( MULTI_MARKDOWN_METADATA_ENTRY, Pattern.MULTILINE ); Matcher lineMatcher = metadataPattern.matcher( metadataMatcher.group( 1 ) ); boolean first = true; while ( lineMatcher.find() ) { String key = StringUtils.trimToEmpty( lineMatcher.group( 1 ) ); if ( first ) { boolean found = false; for ( String k : STANDARD_METADATA_KEYS ) { if ( k.equalsIgnoreCase( key ) ) { found = true; break; } } if ( !found ) { break; } first = false; } String value = StringUtils.trimToEmpty( lineMatcher.group( 2 ) ); if ( "title".equalsIgnoreCase( key ) ) { haveTitle = true; html.append( "" ); html.append( StringEscapeUtils.escapeXml( value ) ); html.append( "" ); } else if ( "author".equalsIgnoreCase( key ) ) { html.append( "" ); } else if ( "date".equalsIgnoreCase( key ) ) { html.append( "" ); } else { html.append( "" ); } } if ( !first ) { text = text.substring( metadataMatcher.end() ); } } RootNode rootNode = PEGDOWN_PROCESSOR.parseMarkdown( text.toCharArray() ); if ( !haveTitle && rootNode.getChildren().size() > 0 ) { // use the first (non-comment) node only if it is a heading int i = 0; Node firstNode = null; while ( i < rootNode.getChildren().size() && isHtmlComment( ( firstNode = rootNode.getChildren().get( i ) ) ) ) { i++; } if ( firstNode instanceof HeaderNode ) { html.append( "" ); html.append( StringEscapeUtils.escapeXml( nodeText( firstNode ) ) ); html.append( "" ); } } html.append( "" ); html.append( "" ); html.append( new MarkdownToDoxiaHtmlSerializer().toHtml( rootNode ) ); html.append( "" ); html.append( "" ); return html.toString(); } public static boolean isHtmlComment( Node node ) { if ( node instanceof HtmlBlockNode ) { HtmlBlockNode blockNode = (HtmlBlockNode) node; return blockNode.getText().startsWith( "