/*
* Copyright © 2019-2023 Metreeca srl
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.metreeca.mark.tasks;
import com.metreeca.mark.*;
import org.apache.maven.plugin.logging.Log;
import org.w3c.dom.*;
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.file.*;
import java.util.*;
import java.util.Map.Entry;
import java.util.regex.Pattern;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import javax.xml.namespace.NamespaceContext;
import javax.xml.xpath.*;
import static com.metreeca.http.handlers.Publisher.variants;
import static com.metreeca.xml.codecs.HTML.html;
import static java.lang.String.format;
import static java.lang.System.currentTimeMillis;
import static java.net.URLDecoder.decode;
import static java.nio.charset.StandardCharsets.UTF_8;
import static java.util.Map.entry;
import static java.util.stream.Collectors.toSet;
/**
* Site link checking task.
*
* Scans HTML files in the {@linkplain Opts#target() target} site folder and reports dangling links: for each page it
* collects {@code id}/{@code name} anchors and {@code href}/{@code src} links, normalizes relative links against the
* page location and flags links whose target is neither a known site file or anchor nor an absolute external URL.
*/
public final class Check implements Task {
private static final Pattern AbsolutePattern=Pattern.compile("^\\w+:.*$");
private static final Pattern AnchoredAssetPattern=Pattern.compile("(?!\\.x?html)(\\.\\w+)#.*$");
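// AbsolutePattern matches links with an explicit URI scheme, e.g. "https://example.com/" or "mailto:info@example.com";
// AnchoredAssetPattern drops fragments from links to non-HTML assets, e.g. "guide.pdf#install" ~› "guide.pdf", while
// fragments on .html/.xhtml pages are preserved so they can be checked against the anchors collected by scan()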
@Override public void exec(final Mark mark) {
final Path target=mark.target();
final Log logger=mark.logger();
try ( final Stream<Path> walk=Files.walk(target) ) {
final long start=currentTimeMillis();
final Set<Entry<String, String>> links=walk
.filter(Files::isRegularFile)
.flatMap(path -> Stream.concat(
// self link (used to list all known files)
Stream.of(entry(
target.relativize(path).toString(),
target.relativize(path).toString()
)),
// html links
path.toString().endsWith(".html")
? scan(target, path)
// !!! mark.logger().warn(format("%s / %s", path, error.getMessage()));
: Stream.empty()
))
.collect(toSet());
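// entry keys feed the set of known internal targets (site-relative file paths and page anchors);
// entry values are the link targets to be verified against that set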
final Set<String> internal=links.stream() // verified internal link targets
.map(Entry::getKey)
.collect(toSet());
final Set<String> external=links.stream() // verified external link targets
.map(Entry::getValue)
.filter(AbsolutePattern.asPredicate())
// !!! external links
//.distinct()
//.filter(url -> {
// try {
//
// logger.info(format("checking %s", url));
//
// return validate(url);
//
// } catch ( final IOException e ) {
//
// logger.warn(e.toString());
//
// return false;
//
// }
//})
.collect(toSet());
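// a link is broken when no variant of its decoded target (as generated by Publisher.variants()) is a known internal
// target and it is not an absolute external URL; external URLs are currently accepted as-is, since the HEAD-request
// validation above is commented out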
final long broken=links.stream()
.filter(link -> variants(decode(link.getValue(), UTF_8)).noneMatch(internal::contains))
.filter(link -> Stream.of(link.getValue()).noneMatch(external::contains))
.peek(link -> logger.warn(format("%s ~› %s", link.getKey(), link.getValue())))
.count();
final long stop=currentTimeMillis();
if ( broken > 0 ) {
logger.warn(format("%d broken links", broken));
} else {
logger.info("no broken links");
}
if ( !links.isEmpty() ) {
logger.info(format("processed %,d files in %,.3f s", links.size(), (stop-start)/1000.0f));
}
} catch ( final IOException e ) {
throw new UncheckedIOException(e);
}
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
private Stream<Entry<String, String>> scan(final Path base, final Path path) {
final String self=base.relativize(path).toString();
final Document document=parse(path);
return Stream.concat(
nodes(document, "//@id|//html:a/@name").map(Node::getTextContent) // internal anchor links
.map(anchor -> self+"#"+anchor)
.map(anchor -> entry(anchor, anchor)),
nodes(document, "//@href|//@src").map(Node::getTextContent) // external links
// normalize links
.map(link -> AbsolutePattern.matcher(link).matches() ? link
: link.startsWith("//") ? "http:"+link
: link.startsWith("#") ? self+link
: link.startsWith("/") ? Paths.get(".", clean(link)).normalize().toString()
: base.relativize(path.resolveSibling(clean(link))).normalize().toString()
)
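// for instance, assuming self="docs/page.html": "//example.com/lib.js" ~› "http://example.com/lib.js",
// "#intro" ~› "docs/page.html#intro", "/assets/app.css" ~› "assets/app.css", "../tutorials/" ~› "tutorials/index.html"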
// ignore anchors on asset files
.map(link -> AnchoredAssetPattern.matcher(link).replaceAll("$1"))
.map(link -> entry(self, decode(link, UTF_8)))
);
}
private String clean(final String link) { // fix glitches in local links
return index(query(link));
}
private String index(final String link) { // add missing index files
return link.endsWith("/") ? link+"index.html"
: link.endsWith(".") ? link+"/index.html"
: link;
}
private String query(final String link) { // remove query components (e.g. javadocs)
final int question=link.indexOf('?');
return question >= 0 ? link.substring(0, question) : link;
}
private Document parse(final Path path) {
try ( final InputStream stream=Files.newInputStream(path) ) {
return html(stream, UTF_8, path.toString());
} catch ( final IOException e ) {
throw new UncheckedIOException(e);
}
}
private Stream<Node> nodes(final Document document, final String query) {
try {
final XPath xpath=XPathFactory
.newInstance()
.newXPath();
xpath.setNamespaceContext(new NamespaceContext() {
@Override public String getNamespaceURI(final String prefix) {
return prefix.equals("html") ? "http://www.w3.org/1999/xhtml" : null;
}
@Override public String getPrefix(final String namespaceURI) {
throw new UnsupportedOperationException("prefix lookup");
}
@Override public Iterator<String> getPrefixes(final String namespaceURI) {
throw new UnsupportedOperationException("prefixes lookup");
}
});
return StreamSupport.stream(Spliterators.spliteratorUnknownSize(new NodeIterator((NodeList)xpath
.compile(query)
.evaluate(document, XPathConstants.NODESET)
), Spliterator.ORDERED), false);
} catch ( final XPathExpressionException e ) {
throw new RuntimeException("unable to evaluate xpath expression", e);
}
}
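// external URL validation via an HTTP HEAD request; currently referenced only by the commented-out
// external link check in exec() above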
private boolean validate(final String url) throws IOException {
if ( url.startsWith("javascript:") ) {
return true;
} else if ( url.startsWith("http") ) {
final HttpURLConnection connection=(HttpURLConnection)new URL(url).openConnection();
connection.setRequestMethod("HEAD");
connection.setRequestProperty("User-Agent", ""
+"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6)"
+"AppleWebKit/537.36 (KHTML, like Gecko)"
+"Chrome/85.0.4183.102 "
+"Safari/537.36"
);
connection.setInstanceFollowRedirects(true);
connection.setConnectTimeout(2500);
connection.setReadTimeout(2500);
connection.connect();
return connection.getResponseCode()/100 == 2;
} else {
return !new URL(url).toString().isEmpty(); // only well-formedness tests;
}
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
private static final class NodeIterator implements Iterator<Node> {
private final NodeList nodes;
private int next;
private NodeIterator(final NodeList nodes) { this.nodes=nodes; }
@Override public boolean hasNext() {
return next < nodes.getLength();
}
@Override public Node next() throws NoSuchElementException {
if ( !hasNext() ) {
throw new NoSuchElementException("no more iterator elements");
}
return nodes.item(next++);
}
}
}