
org.jbake.app.Crawler


JBake is a Java-based, open source static site/blog generator for developers.

package org.jbake.app;

import org.apache.commons.configuration2.CompositeConfiguration;
import org.apache.commons.io.FilenameUtils;
import org.jbake.app.configuration.JBakeConfiguration;
import org.jbake.app.configuration.JBakeConfigurationFactory;
import org.jbake.model.DocumentModel;
import org.jbake.model.DocumentStatus;
import org.jbake.model.DocumentTypes;
import org.jbake.model.ModelAttributes;
import org.jbake.util.HtmlUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Date;

/**
 * Crawls a file system looking for content.
 *
 * @author Jonathan Bullock [email protected]
 */
public class Crawler {

    private static final Logger logger = LoggerFactory.getLogger(Crawler.class);
    private final ContentStore db;
    private final JBakeConfiguration config;
    private final Parser parser;

    /**
     * Creates new instance of Crawler.
     *
     * @param db     Database instance for content
     * @param source Base directory where content directory is located
     * @param config Project configuration
     * @deprecated Use {@link #Crawler(ContentStore, JBakeConfiguration)} instead.
     */
    @Deprecated
    public Crawler(ContentStore db, File source, CompositeConfiguration config) {
        this.db = db;
        this.config = new JBakeConfigurationFactory().createDefaultJbakeConfiguration(source, config);
        this.parser = new Parser(this.config);
    }

    /**
     * Creates new instance of Crawler.
     *
     * @param db     Database instance for content
     * @param config Project configuration
     */
    public Crawler(ContentStore db, JBakeConfiguration config) {
        this.db = db;
        this.config = config;
        this.parser = new Parser(config);
    }

    public void crawl() {
        crawl(config.getContentFolder());
        logger.info("Content detected:");
        for (String docType : DocumentTypes.getDocumentTypes()) {
            long count = db.getDocumentCount(docType);
            if (count > 0) {
                logger.info("Parsed {} files of type: {}", count, docType);
            }
        }
    }

    public void crawlDataFiles() {
        crawlDataFiles(config.getDataFolder());
        logger.info("Data files detected:");
        String docType = config.getDataFileDocType();
        long count = db.getDocumentCount(docType);
        if (count > 0) {
            logger.info("Parsed {} files", count);
        }
    }

    /**
     * Crawl all files and folders looking for content.
     *
     * @param path Folder to start from
     */
    private void crawl(File path) {
        File[] contents = path.listFiles(FileUtil.getFileFilter(config));
        if (contents != null) {
            Arrays.sort(contents);
            for (File sourceFile : contents) {
                if (sourceFile.isFile()) {
                    crawlFile(sourceFile);
                } else if (sourceFile.isDirectory()) {
                    crawl(sourceFile);
                }
            }
        }
    }

    private void crawlFile(File sourceFile) {
        StringBuilder sb = new StringBuilder();
        sb.append("Processing [").append(sourceFile.getPath()).append("]... ");
        String sha1 = buildHash(sourceFile);
        String uri = buildURI(sourceFile);
        DocumentStatus status = findDocumentStatus(uri, sha1);
        if (status == DocumentStatus.UPDATED) {
            sb.append(" : modified ");
            db.deleteContent(uri);
        } else if (status == DocumentStatus.IDENTICAL) {
            sb.append(" : same ");
        } else if (DocumentStatus.NEW == status) {
            sb.append(" : new ");
        }
        logger.info("{}", sb);
        if (status != DocumentStatus.IDENTICAL) {
            processSourceFile(sourceFile, sha1, uri);
        }
    }

    /**
     * Crawl all files and folders looking for data files.
     *
     * @param path Folder to start from
     */
    private void crawlDataFiles(File path) {
        File[] contents = path.listFiles(FileUtil.getDataFileFilter());
        if (contents != null) {
            Arrays.sort(contents);
            for (File sourceFile : contents) {
                if (sourceFile.isFile()) {
                    StringBuilder sb = new StringBuilder();
                    sb.append("Processing [").append(sourceFile.getPath()).append("]... ");
                    String sha1 = buildHash(sourceFile);
                    String uri = buildDataFileURI(sourceFile);
                    boolean process = true;
                    DocumentStatus status = DocumentStatus.NEW;
                    String docType = config.getDataFileDocType();
                    status = findDocumentStatus(uri, sha1);
                    if (status == DocumentStatus.UPDATED) {
                        sb.append(" : modified ");
                        db.deleteContent(uri);
                    } else if (status == DocumentStatus.IDENTICAL) {
                        sb.append(" : same ");
                        process = false;
                    }
                    if (!process) {
                        break;
                    }
                    if (DocumentStatus.NEW == status) {
                        sb.append(" : new ");
                    }
                    if (process) { // new or updated
                        crawlDataFile(sourceFile, sha1, uri, docType);
                    }
                    logger.info("{}", sb);
                }
                if (sourceFile.isDirectory()) {
                    crawlDataFiles(sourceFile);
                }
            }
        }
    }

    private String buildHash(final File sourceFile) {
        String sha1;
        try {
            sha1 = FileUtil.sha1(sourceFile);
        } catch (Exception e) {
            logger.error("unable to build sha1 hash for source file '{}'", sourceFile);
            sha1 = "";
        }
        return sha1;
    }

    private String buildURI(final File sourceFile) {
        String uri = FileUtil.asPath(sourceFile).replace(FileUtil.asPath(config.getContentFolder()), "");
        if (useNoExtensionUri(uri)) {
            // convert URI from xxx.html to xxx/index.html
            uri = createNoExtensionUri(uri);
        } else {
            uri = createUri(uri);
        }
        // strip off leading / to enable generating non-root based sites
        if (uri.startsWith(FileUtil.URI_SEPARATOR_CHAR)) {
            uri = uri.substring(1);
        }
        return uri;
    }

    private String buildDataFileURI(final File sourceFile) {
        String uri = FileUtil.asPath(sourceFile).replace(FileUtil.asPath(config.getDataFolder()), "");
        // strip off leading /
        if (uri.startsWith(FileUtil.URI_SEPARATOR_CHAR)) {
            uri = uri.substring(1);
        }
        return uri;
    }

    // TODO: Refactor - parametrize the following two methods into one.
    // commons-codec's URLCodec could be used when we add that dependency.
    private String createUri(String uri) {
        try {
            return FileUtil.URI_SEPARATOR_CHAR
                    + FilenameUtils.getPath(uri)
                    + URLEncoder.encode(FilenameUtils.getBaseName(uri), StandardCharsets.UTF_8.name())
                    + config.getOutputExtension();
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException("Missing UTF-8 encoding??", e); // Won't happen unless JDK is broken.
        }
    }

    private String createNoExtensionUri(String uri) {
        try {
            return FileUtil.URI_SEPARATOR_CHAR
                    + FilenameUtils.getPath(uri)
                    + URLEncoder.encode(FilenameUtils.getBaseName(uri), StandardCharsets.UTF_8.name())
                    + FileUtil.URI_SEPARATOR_CHAR
                    + "index"
                    + config.getOutputExtension();
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException("Missing UTF-8 encoding??", e); // Won't happen unless JDK is broken.
        }
    }

    private boolean useNoExtensionUri(String uri) {
        boolean noExtensionUri = config.getUriWithoutExtension();
        String noExtensionUriPrefix = config.getPrefixForUriWithoutExtension();
        return noExtensionUri
                && (noExtensionUriPrefix != null)
                && (noExtensionUriPrefix.length() > 0)
                && uri.startsWith(noExtensionUriPrefix);
    }

    private void crawlDataFile(final File sourceFile, final String sha1, final String uri, final String documentType) {
        try {
            DocumentModel document = parser.processFile(sourceFile);
            if (document != null) {
                document.setSha1(sha1);
                document.setRendered(true);
                document.setFile(sourceFile.getPath());
                document.setSourceUri(uri);
                document.setType(documentType);
                db.addDocument(document);
            } else {
                logger.warn("{} couldn't be parsed so it has been ignored!", sourceFile);
            }
        } catch (Exception ex) {
            throw new RuntimeException("Failed crawling file: " + sourceFile.getPath() + " " + ex.getMessage(), ex);
        }
    }

    private void processSourceFile(final File sourceFile, final String sha1, final String uri) {
        DocumentModel document = parser.processFile(sourceFile);
        if (document != null) {
            if (DocumentTypes.contains(document.getType())) {
                addAdditionalDocumentAttributes(document, sourceFile, sha1, uri);
                if (config.getImgPathUpdate()) {
                    // Prevent image source url's from breaking
                    HtmlUtil.fixImageSourceUrls(document, config);
                }
                db.addDocument(document);
            } else {
                logger.warn("{} has an unknown document type '{}' and has been ignored!", sourceFile, document.getType());
            }
        } else {
            logger.warn("{} has an invalid header, it has been ignored!", sourceFile);
        }
    }

    private void addAdditionalDocumentAttributes(DocumentModel document, File sourceFile, String sha1, String uri) {
        document.setRootPath(getPathToRoot(sourceFile));
        document.setSha1(sha1);
        document.setRendered(false);
        document.setFile(sourceFile.getPath());
        document.setSourceUri(uri);
        document.setUri(uri);
        document.setCached(true);

        if (document.getStatus().equals(ModelAttributes.Status.PUBLISHED_DATE)
                && (document.getDate() != null)
                && new Date().after(document.getDate())) {
            document.setStatus(ModelAttributes.Status.PUBLISHED);
        }

        if (config.getUriWithoutExtension()) {
            document.setNoExtensionUri(uri.replace("/index.html", "/"));
        }
    }

    private String getPathToRoot(File sourceFile) {
        return FileUtil.getUriPathToContentRoot(config, sourceFile);
    }

    private DocumentStatus findDocumentStatus(String uri, String sha1) {
        DocumentList match = db.getDocumentStatus(uri);
        if (!match.isEmpty()) {
            DocumentModel document = match.get(0);
            String oldHash = document.getSha1();
            if (!oldHash.equals(sha1) || !document.getRendered()) {
                return DocumentStatus.UPDATED;
            } else {
                return DocumentStatus.IDENTICAL;
            }
        } else {
            return DocumentStatus.NEW;
        }
    }
}
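
For orientation, below is a minimal, hypothetical driver for the class above. It is a sketch, not JBake's actual entry point (in a real build the Oven class wires the crawler into the bake lifecycle), and it sticks to calls visible in this listing: the ContentStore is taken as a parameter because its construction and startup are not part of this file, the empty CompositeConfiguration stands in for a loaded jbake.properties, and the CrawlerSketch/crawlSite names are made up.

import java.io.File;

import org.apache.commons.configuration2.CompositeConfiguration;
import org.jbake.app.ContentStore;
import org.jbake.app.Crawler;
import org.jbake.app.configuration.JBakeConfiguration;
import org.jbake.app.configuration.JBakeConfigurationFactory;

public class CrawlerSketch {

    // db is assumed to be an already-started ContentStore; its setup is
    // outside the scope of the listing above.
    static void crawlSite(ContentStore db, File sourceDir) {
        // Hypothetical stand-in for a loaded jbake.properties; a real build
        // supplies the project's settings here.
        CompositeConfiguration legacy = new CompositeConfiguration();
        JBakeConfiguration config = new JBakeConfigurationFactory()
                .createDefaultJbakeConfiguration(sourceDir, legacy);

        Crawler crawler = new Crawler(db, config);

        // Recursive scan of config.getContentFolder(): files whose sha1 is
        // unchanged and already rendered are skipped, new or updated ones
        // are parsed and written to the ContentStore.
        crawler.crawl();

        // The equivalent incremental pass over the data folder, assuming
        // the project defines one.
        crawler.crawlDataFiles();
    }
}

The sha1-plus-rendered check in findDocumentStatus is what makes both passes incremental: rerunning the crawl after touching a single source file reprocesses only that file.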