All downloads are free. Search and download functionality uses the official Maven repository.

com.metreeca.mark.tasks.Check Maven / Gradle / Ivy

/*
 * Copyright © 2019-2023 Metreeca srl
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.metreeca.mark.tasks;

import com.metreeca.mark.*;

import org.apache.maven.plugin.logging.Log;
import org.w3c.dom.*;

import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.file.*;
import java.util.*;
import java.util.Map.Entry;
import java.util.regex.Pattern;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

import javax.xml.namespace.NamespaceContext;
import javax.xml.xpath.*;

import static com.metreeca.http.handlers.Publisher.variants;
import static com.metreeca.xml.codecs.HTML.html;

import static java.lang.String.format;
import static java.lang.System.currentTimeMillis;
import static java.net.URLDecoder.decode;
import static java.nio.charset.StandardCharsets.UTF_8;
import static java.util.Map.entry;
import static java.util.stream.Collectors.toSet;

/**
 * Site link checking task.
 *
 * 

Scans HTML files in the {@linkplain Opts#target() target} site folder and reports dangling links.

*/ public final class Check implements Task { private static final Pattern AbsolutePattern=Pattern.compile("^\\w+:.*$"); private static final Pattern AnchoredAssetPattern=Pattern.compile("(?!\\.x?html)(\\.\\w+)#.*$"); @Override public void exec(final Mark mark) { final Path target=mark.target(); final Log logger=mark.logger(); try ( final Stream walk=Files.walk(target) ) { final long start=currentTimeMillis(); final Set> links=walk .filter(Files::isRegularFile) .flatMap(path -> Stream.concat( // self link (used to list all known files) Stream.of(entry( target.relativize(path).toString(), target.relativize(path).toString() )), // html links path.toString().endsWith(".html") ? scan(target, path) // !!! mark.logger().warn(format("%s / %s", path, error.getMessage())); : Stream.empty() )) .collect(toSet()); final Set internal=links.stream() // verified internal link targets .map(Entry::getKey) .collect(toSet()); final Set external=links.stream() // verified external link targets .map(Entry::getValue) .filter(AbsolutePattern.asPredicate()) // !!! 
external links //.distinct() //.filter(url -> { // try { // // logger.info(format("checking %s", url)); // // return validate(url); // // } catch ( final IOException e ) { // // logger.warn(e.toString()); // // return false; // // } //}) .collect(toSet()); final long broken=links.stream() .filter(link -> variants(decode(link.getValue(), UTF_8)).noneMatch(internal::contains)) .filter(link -> Stream.of(link.getValue()).noneMatch(external::contains)) .peek(link -> logger.warn(format("%s ~› %s", link.getKey(), link.getValue()))) .count(); final long stop=currentTimeMillis(); if ( broken > 0 ) { logger.warn(format("%d broken links", broken)); } else { logger.info("no broken links"); } if ( !links.isEmpty() ) { logger.info(format("processed %,d files in %,.3f s", links.size(), (stop-start)/1000.0f)); } } catch ( final IOException e ) { throw new UncheckedIOException(e); } } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// private Stream> scan(final Path base, final Path path) { final String self=base.relativize(path).toString(); final Document document=parse(path); return Stream.concat( nodes(document, "//@id|//html:a/@name").map(Node::getTextContent) // internal anchor links .map(anchor -> self+"#"+anchor) .map(anchor -> entry(anchor, anchor)), nodes(document, "//@href|//@src").map(Node::getTextContent) // external links // normalize links .map(link -> AbsolutePattern.matcher(link).matches() ? link : link.startsWith("//") ? "http:"+link : link.startsWith("#") ? self+link : link.startsWith("/") ? 
Paths.get(".", clean(link)).normalize().toString() : base.relativize(path.resolveSibling(clean(link))).normalize().toString() ) // ignore anchors on asset files .map(link -> AnchoredAssetPattern.matcher(link).replaceAll("$1")) .map(link -> entry(self, decode(link, UTF_8))) ); } private String clean(final String link) { // fix glitches in local links return index(query(link)); } private String index(final String link) { // add missing index files return link.endsWith("/") ? link+"index.html" : link.endsWith(".") ? link+"/index.html" : link; } private String query(final String link) { // remove query components (e.g. javadocs) final int question=link.indexOf('?'); return question >= 0 ? link.substring(0, question) : link; } private Document parse(final Path path) { try ( final InputStream stream=Files.newInputStream(path) ) { return html(stream, UTF_8, path.toString()); } catch ( final IOException e ) { throw new UncheckedIOException(e); } } private Stream nodes(final Document document, final String query) { try { final XPath xpath=XPathFactory .newInstance() .newXPath(); xpath.setNamespaceContext(new NamespaceContext() { @Override public String getNamespaceURI(final String prefix) { return prefix.equals("html") ? 
"http://www.w3.org/1999/xhtml" : null; } @Override public String getPrefix(final String namespaceURI) { throw new UnsupportedOperationException("prefix lookup"); } @Override public Iterator getPrefixes(final String namespaceURI) { throw new UnsupportedOperationException("prefixes lookup"); } }); return StreamSupport.stream(Spliterators.spliteratorUnknownSize(new NodeIterator((NodeList)xpath .compile(query) .evaluate(document, XPathConstants.NODESET) ), Spliterator.ORDERED), false); } catch ( final XPathExpressionException e ) { throw new RuntimeException("unable to evaluate xpath expression", e); } } private boolean validate(final String url) throws IOException { if ( url.startsWith("javascript:") ) { return true; } else if ( url.startsWith("http") ) { final HttpURLConnection connection=(HttpURLConnection)new URL(url).openConnection(); connection.setRequestMethod("HEAD"); connection.setRequestProperty("User-Agent", "" +"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6)" +"AppleWebKit/537.36 (KHTML, like Gecko)" +"Chrome/85.0.4183.102 " +"Safari/537.36" ); connection.setInstanceFollowRedirects(true); connection.setConnectTimeout(2500); connection.setReadTimeout(2500); connection.connect(); return connection.getResponseCode()/100 == 2; } else { return !new URL(url).toString().isEmpty(); // only well-formedness tests; } } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// private static final class NodeIterator implements Iterator { private final NodeList nodes; private int next; private NodeIterator(final NodeList nodes) { this.nodes=nodes; } @Override public boolean hasNext() { return next < nodes.getLength(); } @Override public Node next() throws NoSuchElementException { if ( !hasNext() ) { throw new NoSuchElementException("no more iterator elements"); } return nodes.item(next++); } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy