All downloads are free. Search and download functionality uses the official Maven repository.

com.panforge.robotstxt.RobotsTxtImpl Maven / Gradle / Ivy

There is a newer version: 1.4.6
Show newest version
/*
 * Copyright 2016 Piotr Andzel.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.panforge.robotstxt;

import java.io.PrintWriter;
import java.io.StringWriter;
import java.net.URI;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;

/**
 * Robots TXT implementation.
 *
 * <p>Holds one optional default group (the group applying to any agent), a list
 * of agent-specific groups, and the file-level host and sitemap entries.
 * Access queries are answered by picking the group for the user agent,
 * collecting the matching access entries and letting the configured
 * {@link WinningStrategy} choose among them.
 */
class RobotsTxtImpl implements RobotsTxt {

  private Group defaultSection;
  private final List<Group> groups = new ArrayList<>();

  private Integer crawlDelay;
  private String host;
  private final List<String> sitemaps = new ArrayList<>();

  private final MatchingStrategy matchingStrategy;
  private final WinningStrategy winningStrategy;

  /**
   * Creates instance of the RobotsTxt implementation
   *
   * @param matchingStrategy strategy handed to the selected group when matching a path
   * @param winningStrategy strategy choosing the winner among the selected access entries
   */
  public RobotsTxtImpl(MatchingStrategy matchingStrategy, WinningStrategy winningStrategy) {
    this.matchingStrategy = matchingStrategy;
    this.winningStrategy = winningStrategy;
  }

  @Override
  public String getHost() {
    return host;
  }

  /**
   * Sets host.
   *
   * @param host host name
   */
  public void setHost(String host) {
    this.host = host;
  }

  /**
   * Gets sitemaps.
   *
   * @return the live, mutable list of sitemap entries held by this instance
   */
  @Override
  public List<String> getSitemaps() {
    // The internal list is returned on purpose without a defensive copy:
    // this class offers no other way to add sitemap entries.
    return sitemaps;
  }

  /**
   * Sets crawl delay.
   *
   * @param crawlDelay crawl delay.
   * @deprecated this setter is marked deprecated; no replacement is declared
   *     in this class.
   */
  @Deprecated
  public void setCrawlDelay(Integer crawlDelay) {
    this.crawlDelay = crawlDelay;
  }

  @Override
  public Integer getCrawlDelay() {
    return crawlDelay;
  }

  /**
   * Lists the clauses of all non-granting access entries of the group selected
   * for the given user agent.
   *
   * @param userAgent user agent used to select the group
   * @return clauses of entries without access; empty list if no group applies
   */
  @Override
  public List<String> getDisallowList(String userAgent) {
    Group sec = findSectionByAgent(groups, userAgent, defaultSection);
    if (sec == null) {
      return Collections.emptyList();
    }
    return sec.getAccessList().listAll().stream()
            .filter(acc -> !acc.hasAccess())
            .map(Access::getClause)
            .collect(Collectors.toList());
  }

  @Override
  public boolean query(String userAgent, String path) {
    // ask() never returns null: it falls back to the default access.
    return ask(userAgent, path).hasAccess();
  }

  @Override
  public Grant ask(String userAgent, String path) {
    // Hand the strategy its own mutable copy, exactly as before, because
    // select() may return the shared immutable empty list.
    List<Access> candidates = new ArrayList<>(select(userAgent, path));
    Access winner = winningStrategy.selectWinner(candidates);
    return winner != null ? winner : createDefaultAccess();
  }

  /**
   * Adds section.
   *
   * <p>An any-agent section becomes the default section, or is merged into the
   * existing one. Any other section is merged into an already registered
   * exact counterpart if there is one, otherwise appended to the groups.
   *
   * @param section section; {@code null} is ignored
   */
  public void addGroup(Group section) {
    if (section == null) {
      return;
    }

    if (section.isAnyAgent()) {
      if (this.defaultSection == null) {
        this.defaultSection = section;
      } else {
        this.defaultSection.getAccessList().importAccess(section.getAccessList());
      }
      return;
    }

    Group exact = findExactSection(section);
    if (exact == null) {
      groups.add(section);
    } else {
      exact.getAccessList().importAccess(section.getAccessList());
    }
  }

  @Override
  public String toString() {
    StringWriter sw = new StringWriter();
    PrintWriter pw = new PrintWriter(sw);

    if (defaultSection != null) {
      pw.println(defaultSection);
    }

    groups.forEach(group -> pw.println(group));

    if (host != null) {
      pw.format("Host: %s", host).println();
    }

    sitemaps.forEach(sitemap -> pw.format("Sitemap: %s", sitemap).println());

    pw.flush();

    return sw.toString();
  }

  /**
   * Finds exact section.
   *
   * @param section section to find exact
   * @return exact section or {@code null} if no exact section found
   */
  private Group findExactSection(Group section) {
    for (Group s : groups) {
      if (s.isExact(section)) {
        return s;
      }
    }
    return null;
  }

  /**
   * Selects the access entries applicable to the path for the user agent.
   *
   * @param userAgent user agent used to select the group
   * @param path path or absolute URL being queried
   * @return an empty list when the path is {@code null} or is
   *     {@code /robots.txt} (compared case-insensitively); otherwise a
   *     non-empty list, holding the default access if nothing was selected
   */
  private List<Access> select(String userAgent, String path) {
    String relativePath = assureRelative(path);

    if (relativePath == null || "/robots.txt".equalsIgnoreCase(relativePath)) {
      return Collections.emptyList();
    }

    List<Access> selected = new ArrayList<>();

    Group sec = findSectionByAgent(groups, userAgent, defaultSection);
    if (sec != null) {
      selected.addAll(sec.select(userAgent, relativePath, matchingStrategy));
    }
    if (selected.isEmpty()) {
      selected.add(createDefaultAccess());
    }
    return selected;
  }

  /**
   * Finds the first section matching the user agent.
   *
   * @param sections sections to search, in order
   * @param userAgent user agent to match
   * @param defaultGroup value returned when no section matches (may be {@code null})
   * @return first matching section or {@code defaultGroup}
   */
  private Group findSectionByAgent(List<Group> sections, String userAgent, Group defaultGroup) {
    for (Group sec : sections) {
      if (sec.matchUserAgent(userAgent)) {
        return sec;
      }
    }
    return defaultGroup;
  }

  /**
   * Reduces an absolute URL to its path, query and fragment; any other input
   * is returned unchanged.
   *
   * @param path path or absolute URL (may be {@code null})
   * @return relative form of an absolute URL, or the input itself when it is
   *     {@code null}, not absolute, or cannot be parsed
   */
  private String assureRelative(String path) {
    if (path == null) {
      return null;
    }
    try {
      URI uri = new URI(path);
      if (!uri.isAbsolute()) {
        return path;
      }
      URL url = uri.toURL();
      // Collapse repeated slashes in the path component only. Applying the
      // collapse to the query or fragment would corrupt values such as
      // "?to=http://example.com" and make an absolute URL match differently
      // from the equivalent relative path.
      String normalizedPath = ("/" + url.getPath()).replaceAll("/+", "/");
      String query = url.getQuery() != null ? "?" + url.getQuery() : "";
      String ref = url.getRef() != null ? "#" + url.getRef() : "";
      return normalizedPath + query + ref;
    } catch (Exception ignored) {
      // Best effort: input that cannot be parsed as a URI or converted to a
      // URL is used as given.
      return path;
    }
  }

  /**
   * Creates the fallback access entry: bound to the default section, with
   * empty strings for the two text arguments and access granted.
   *
   * @return new default access
   */
  private Access createDefaultAccess() {
    return new Access(defaultSection, "", "", true);
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy