// org.apache.tika.parser.code.SourceCodeParser Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.code;

import static com.uwyn.jhighlight.renderer.XhtmlRendererFactory.CPP;
import static com.uwyn.jhighlight.renderer.XhtmlRendererFactory.GROOVY;
import static com.uwyn.jhighlight.renderer.XhtmlRendererFactory.JAVA;

import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.nio.charset.Charset;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.uwyn.jhighlight.renderer.Renderer;
import com.uwyn.jhighlight.renderer.XhtmlRendererFactory;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.tika.detect.AutoDetectReader;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractEncodingDetectorParser;
import org.apache.tika.parser.ParseContext;
import org.ccil.cowan.tagsoup.HTMLSchema;
import org.ccil.cowan.tagsoup.Schema;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

/**
 * Generic source code parser for Java, Groovy and C++.
 * <p>
 * The source text is syntax-highlighted into XHTML and that XHTML is then
 * replayed as SAX events to the caller's content handler. While reading the
 * source, {@code @author} tags are collected as creators and the number of
 * lines is recorded under the {@code LoC} metadata key.
 * <p>
 * Note: this parser uses the JHighlight library
 * (https://github.com/codelibs/jhighlight), which is distributed under a
 * CDDL/LGPL dual license.
 *
 * @author Hong-Thai.Nguyen
 * @since 1.6
 */
public class SourceCodeParser extends AbstractEncodingDetectorParser {

    private static final long serialVersionUID = -4543476498190054160L;

    /** Matches an {@code @author} tag (case-insensitive) and captures the rest of the line. */
    private static final Pattern AUTHOR_PATTERN = Pattern.compile("(?im)@author (.*) *$");

    /**
     * Supported media types mapped to the JHighlight renderer type used for
     * them. Unmodifiable, because its key set is handed out to callers by
     * {@link #getSupportedTypes(ParseContext)}.
     */
    private static final Map<MediaType, String> TYPES_TO_RENDERER;

    static {
        Map<MediaType, String> renderers = new HashMap<>();
        renderers.put(MediaType.text("x-c++src"), CPP);
        renderers.put(MediaType.text("x-java-source"), JAVA);
        renderers.put(MediaType.text("x-groovy"), GROOVY);
        TYPES_TO_RENDERER = Collections.unmodifiableMap(renderers);
    }

    /**
     * Default TagSoup schema used to parse the highlighted HTML when the
     * parse context does not supply its own {@link Schema}. Created once and
     * shared by all parse calls.
     */
    private static final Schema HTML_SCHEMA = new HTMLSchema();

    public SourceCodeParser() {
        super();
    }

    public SourceCodeParser(EncodingDetector encodingDetector) {
        super(encodingDetector);
    }

    /**
     * Returns the media types this parser can handle.
     *
     * @param context the parse context (not used)
     * @return an unmodifiable set of the supported media types
     */
    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return TYPES_TO_RENDERER.keySet();
    }

    /**
     * Highlights the source code in {@code stream} and emits the resulting
     * XHTML to {@code handler}.
     * <p>
     * Nothing is read or emitted unless {@code metadata} carries both a
     * content type and a resource name. When it does, the content type is
     * rewritten in its normalized form, the detected charset is stored as the
     * content encoding, every {@code @author} found is added as a creator and
     * the line count is stored under {@code LoC}.
     *
     * @param stream   the source code to parse; it is not closed by this method
     * @param handler  receives the SAX events of the highlighted XHTML
     * @param metadata supplies content type and resource name, and receives
     *                 the extracted metadata
     * @param context  may supply a TagSoup {@link Schema} and an encoding detector
     * @throws IOException      if the stream cannot be read
     * @throws SAXException     if the highlighted XHTML cannot be processed
     * @throws TikaException    if the character encoding cannot be detected
     * @throws RuntimeException if the content type cannot be parsed or is not
     *                          one of the supported types
     */
    @Override
    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        // The close shield lets the reader be closed without closing the caller's stream.
        try (AutoDetectReader reader = new AutoDetectReader(
                new CloseShieldInputStream(stream), metadata, getEncodingDetector(context))) {
            Charset charset = reader.getCharset();
            String mediaType = metadata.get(Metadata.CONTENT_TYPE);
            String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
            if (mediaType == null || name == null) {
                return;
            }

            MediaType type = MediaType.parse(mediaType);
            if (type == null) {
                // MediaType.parse signals a malformed value with null; fail with a
                // meaningful message instead of a NullPointerException further down.
                throw new RuntimeException("unparseable content type " + mediaType);
            }
            metadata.set(Metadata.CONTENT_TYPE, type.toString());
            metadata.set(Metadata.CONTENT_ENCODING, charset.name());

            String source = readSource(reader, metadata);
            Renderer renderer = getRenderer(type);

            // The final 'false' asks JHighlight for a complete document, not a fragment.
            String codeAsHtml = renderer.highlight(name, source, charset.name(), false);

            Schema schema = context.get(Schema.class, HTML_SCHEMA);

            org.ccil.cowan.tagsoup.Parser parser = new org.ccil.cowan.tagsoup.Parser();
            parser.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
            parser.setContentHandler(handler);
            parser.parse(new InputSource(new StringReader(codeAsHtml)));
        }
    }

    /**
     * Reads the whole source line by line, recording authors and the line
     * count in {@code metadata} along the way.
     *
     * @param reader   the reader positioned at the start of the source
     * @param metadata receives the creators and the {@code LoC} value
     * @return the source text, each line terminated by the platform line separator
     * @throws IOException if reading fails
     */
    private String readSource(AutoDetectReader reader, Metadata metadata) throws IOException {
        String lineSeparator = System.lineSeparator();
        StringBuilder out = new StringBuilder();
        int nbLines = 0;
        String line;
        while ((line = reader.readLine()) != null) {
            out.append(line).append(lineSeparator);
            String author = parseAuthor(line);
            if (author != null) {
                metadata.add(TikaCoreProperties.CREATOR, author);
            }
            nbLines++;
        }
        metadata.set("LoC", String.valueOf(nbLines));
        return out.toString();
    }

    /**
     * Looks up the JHighlight renderer for a media type. Parameters such as
     * {@code charset} are ignored, so {@code text/x-java-source; charset=UTF-8}
     * resolves the same renderer as {@code text/x-java-source}.
     *
     * @param type the parsed media type of the document
     * @return the renderer registered for the base type
     * @throws RuntimeException if the base type is not supported
     */
    private Renderer getRenderer(MediaType type) {
        String rendererType = TYPES_TO_RENDERER.get(type.getBaseType());
        if (rendererType == null) {
            throw new RuntimeException("unparseable content type " + type);
        }
        return XhtmlRendererFactory.getRenderer(rendererType);
    }

    /**
     * Extracts the author named by an {@code @author} tag on the given line.
     *
     * @param line a single line of source code
     * @return the trimmed text following {@code @author}, or {@code null}
     *         when the line has no such tag
     */
    private String parseAuthor(String line) {
        Matcher m = AUTHOR_PATTERN.matcher(line);
        if (m.find()) {
            return m.group(1).trim();
        }
        return null;
    }
}