All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.metreeca.mark.tasks.Check Maven / Gradle / Ivy

There is a newer version: 0.9.1
Show newest version
/*
 * Copyright © 2019-2020 Metreeca srl
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this
 *  file except in compliance with the License. You may obtain a copy of the License at
 *
 *          http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is
 * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and limitations under the License.
 */

package com.metreeca.mark.tasks;

import com.metreeca.mark.*;

import org.apache.maven.plugin.logging.Log;
import org.ccil.cowan.tagsoup.Parser;
import org.w3c.dom.*;
import org.xml.sax.InputSource;

import javax.xml.namespace.NamespaceContext;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.*;
import javax.xml.transform.dom.DOMResult;
import javax.xml.transform.sax.SAXSource;
import javax.xml.xpath.*;
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.AbstractMap.SimpleImmutableEntry;
import java.util.*;
import java.util.regex.Pattern;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

import static com.metreeca.mark.Mark.extension;
import static com.metreeca.mark.Mark.variants;
import static java.lang.String.format;
import static java.lang.System.currentTimeMillis;
import static java.util.stream.Collectors.toList;
import static java.util.stream.Collectors.toSet;

/**
 * Site link checking task.
 *
 * 

Scans HTML files in the {@linkplain Opts#target() target} site folder and reports dangling links.

*/ public final class Check implements Task { private static final Pattern URLPattern=Pattern.compile("^\\w+:.*$"); //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @Override public void exec(final Mark mark) { final Path target=mark.target(); final Log logger=mark.logger(); try ( final Stream walk=Files.walk(target) ) { final long start=currentTimeMillis(); final List> links=walk .filter(Files::isRegularFile) .flatMap(path -> scan(target, path)) .collect(toList()); final Set internal=links.stream() // verified internal link targets .map(Map.Entry::getKey) .collect(toSet()); final Set external=links.stream() // verified external link targets .map(Map.Entry::getValue) .filter(URLPattern.asPredicate()) // !!! external links //.distinct() //.filter(url -> { // try { // // logger.info(format("checking %s", url)); // // return validate(url); // // } catch ( final IOException e ) { // // logger.warn(e.toString()); // // return false; // // } //}) .collect(toSet()); final long broken=links.stream() .filter(link -> variants(link.getValue()).noneMatch(internal::contains)) .filter(link -> Stream.of(link.getValue()).noneMatch(external::contains)) .peek(link -> logger.warn(format("%s ~› %s", link.getKey(), link.getValue()))) .count(); final long stop=currentTimeMillis(); if ( broken > 0 ) { logger.warn(format("%d broken links", broken)); } else { logger.info("no broken links"); } if ( !links.isEmpty() ) { logger.info(format("processed %,d files in %,.3f s", links.size(), (stop-start)/1000.0f)); } } catch ( final IOException e ) { throw new UncheckedIOException(e); } } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// private Stream> scan(final Path base, final Path path) { final String self=base.relativize(path).toString(); if ( extension(path).equals(".html") ) { final Document document=document(path); return Stream.of( Stream.of(new SimpleImmutableEntry<>(self, self)), // document self flink nodes(document, "//@id|//html:a/@name") // anchors self links .map(Node::getTextContent) .map(anchor -> self+"#"+anchor) .map(anchor -> new SimpleImmutableEntry<>(anchor, anchor)), nodes(document, "//@href|//@src") // actual links .map(Node::getTextContent) .map(link -> URLPattern.matcher(link).matches() ? link : link.startsWith("//") ? "http:"+link : link.startsWith("#") ? self+link : base.relativize(path.resolveSibling(clean(link)).normalize()).toString() ) .map(link -> new SimpleImmutableEntry<>(self, link)) ).flatMap(stream -> stream); } else { return Stream.of(new SimpleImmutableEntry<>(self, self)); // document self flink } } private String clean(final String link) { // fix glitches in local links return index(query(link)); } private String index(final String link) { // add missing index files return link.endsWith("/") ? link+"index.html" : link.endsWith(".") ? link+"/index.html" : link; } private String query(final String link) { // remove query components (e.g. javadocs) final int question=link.indexOf('?'); return question >= 0 ? link.substring(0, question) : link; } private Document document(final Path path) { try ( final InputStream stream=Files.newInputStream(path) ) { final String uri=path.toUri().toString(); final InputSource input=new InputSource(); input.setSystemId(uri); input.setByteStream(stream); final Document document=DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument(); document.setDocumentURI(uri); TransformerFactory.newInstance().newTransformer().transform( new SAXSource(new Parser(), input), new DOMResult(document) ); return document; } catch ( final ParserConfigurationException e ) { throw new RuntimeException("unable to create document builder", e); } catch ( final TransformerConfigurationException e ) { throw new RuntimeException("unable to create transformer", e); } catch ( final TransformerException e ) { throw new RuntimeException("unable to parse document", e); } catch ( final IOException e ) { throw new UncheckedIOException(e); } } private Stream nodes(final Document document, final String query) { try { final XPath xpath=XPathFactory .newInstance() .newXPath(); xpath.setNamespaceContext(new NamespaceContext() { @Override public String getNamespaceURI(final String prefix) { return prefix.equals("html") ? "http://www.w3.org/1999/xhtml" : null; } @Override public String getPrefix(final String namespaceURI) { throw new UnsupportedOperationException("prefix lookup"); } @Override public Iterator getPrefixes(final String namespaceURI) { throw new UnsupportedOperationException("prefixes lookup"); } }); return StreamSupport.stream(Spliterators.spliteratorUnknownSize(new NodeIterator((NodeList)xpath .compile(query) .evaluate(document, XPathConstants.NODESET) ), Spliterator.ORDERED), false); } catch ( final XPathExpressionException e ) { throw new RuntimeException("unable to evaluate xpath expression", e); } } private boolean validate(final String url) throws IOException { if ( url.startsWith("javascript:") ) { return true; } else if ( url.startsWith("http") ) { final HttpURLConnection connection=(HttpURLConnection)new URL(url).openConnection(); connection.setRequestMethod("HEAD"); connection.setRequestProperty("User-Agent", "" +"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6)" +"AppleWebKit/537.36 (KHTML, like Gecko)" +"Chrome/85.0.4183.102 " +"Safari/537.36" ); connection.setInstanceFollowRedirects(true); connection.setConnectTimeout(2500); connection.setReadTimeout(2500); connection.connect(); return connection.getResponseCode()/100 == 2; } else { new URL(url); // only well-formedness tests return true; } } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// private static final class NodeIterator implements Iterator { private final NodeList nodes; private int next; private NodeIterator(final NodeList nodes) {this.nodes=nodes;} @Override public boolean hasNext() { return next < nodes.getLength(); } @Override public Node next() throws NoSuchElementException { if ( !hasNext() ) { throw new NoSuchElementException("no more iterator elements"); } return nodes.item(next++); } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy