All downloads are free. Search and download functionality uses the official Maven repository.

org.owasp.url.Absolutizer Maven / Gradle / Ivy

Go to download

Declarative syntax for defining sets of URLs. No need for error-prone regexes.

There is a newer version: 1.2.4
Show newest version
// Copyright (c) 2017, Mike Samuel
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// Neither the name of the OWASP nor the names of its contributors may
// be used to endorse or promote products derived from this software
// without specific prior written permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.

package org.owasp.url;

import java.util.Arrays;
import java.util.EnumSet;

import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Sets;
import org.owasp.url.Scheme.PartRanges;

/** Converts possibly relative URLs to absolute URLs. */
public final class Absolutizer {

  /**
   * Additional schemes recognized besides those defined in
   * {@link BuiltinScheme}.
   */
  public final SchemeLookupTable schemes;
  /**
   * An absolute, hierarchical URL that serves as the base for relative URLs.
   */
  public final String contextUrl;
  /**
   * The scheme looked up in {@link #schemes} from the scheme name at the
   * start of {@link #contextUrl}.  The constructor rejects
   * {@code Scheme.UNKNOWN}.
   */
  final Scheme contextScheme;
  /**
   * The result of decomposing {@link #contextUrl} with
   * {@link #contextScheme}.  Relative references fall back to these parts.
   */
  final PartRanges contextRanges;
  /**
   * The value of {@code endOfScheme(contextUrl)}: the index just past the
   * ':' that ends the scheme in {@link #contextUrl}.
   */
  final int contextEos;

  /**
   * Builds an absolutizer that resolves references against
   * {@code contextUrl}.
   *
   * @param schemes looks up schemes by name.
   * @param contextUrl An absolute, hierarchical URL that serves as the base
   *   for relative URLs.
   * @throws IllegalArgumentException if the contextUrl is malformed, relative,
   *   or uses an unrecognized scheme.
   */
  public Absolutizer(SchemeLookupTable schemes, String contextUrl) {
    this.schemes = schemes;
    this.contextUrl = contextUrl;

    int eos = endOfScheme(contextUrl);
    Preconditions.checkArgument(eos >= 0, "Missing scheme", contextUrl);
    this.contextEos = eos;
    // eos points just past the ':' so the scheme name ends one before it.
    this.contextScheme =
        schemes.schemeForName(contextUrl.substring(0, eos - 1 /* ':' */));
    Preconditions.checkArgument(
        Scheme.UNKNOWN != this.contextScheme,
        "Context URL has unrecognized scheme", contextUrl);
    PartRanges ranges = contextScheme.decompose(
        schemes, contextUrl, eos, contextUrl.length());
    // A null decomposition is treated as a malformed URL.  Report it with
    // IllegalArgumentException, as documented, instead of the
    // NullPointerException that checkNotNull would raise.
    Preconditions.checkArgument(
        ranges != null, "Malformed context URL", contextUrl);
    this.contextRanges = ranges;
  }


  /**
   * Evaluates a URL reference in the context of an absolute URL.
   *
   * <p>A reference that has its own scheme is used as is, apart from
   * dot-segment removal on the path of a hierarchical URL.  A reference
   * without a scheme is merged part by part with the context URL: once any
   * part has been taken from the reference, later parts are no longer
   * inherited from the context.  The fragment is never inherited.
   *
   * <p>If a reference with its own scheme cannot be decomposed, the result
   * carries null ranges instead of this method throwing.
   *
   * @param refUrlText The URL to absolutize.
   * @return A bundle of information gleaned from the absolutizing process.
   */
  Result absolutize(String refUrlText) {
    int eos = endOfScheme(refUrlText);
    EnumSet<UrlValue.CornerCase> cornerCases = EnumSet.noneOf(
        UrlValue.CornerCase.class);

    Scheme scheme;
    PartRanges refUrlRanges, absUrlRanges;
    String absUrlText;
    if (eos >= 0) {
      // The reference names its own scheme so it is already absolute.
      scheme = schemes.schemeForName(
          refUrlText.substring(0, eos - 1 /* ':' */));
      refUrlRanges = scheme.decompose(
          schemes, refUrlText, eos, refUrlText.length());
      absUrlText = refUrlText;
      absUrlRanges = refUrlRanges;
      // The ranges are null-checked before use, matching the null check
      // applied to them further down.
      if (scheme.isHierarchical && refUrlRanges != null
          && refUrlRanges.pathRight >= 0) {
        StringBuilder sb = new StringBuilder(refUrlText.length());
        sb.append(refUrlText, 0, refUrlRanges.pathRight);
        removeDotSegmentsInPlace(sb, refUrlRanges.pathLeft, cornerCases);
        if (sb.length() != refUrlRanges.pathRight) {
          // Path normalization did some work.
          sb.append(refUrlText, refUrlRanges.pathRight, refUrlText.length());
          absUrlText = sb.toString();
          absUrlRanges = scheme.decompose(
              schemes, absUrlText, eos, absUrlText.length());
        }
      }
    } else {
      scheme = contextScheme;
      PartRanges crs = this.contextRanges;
      PartRanges rrs = refUrlRanges = scheme.decompose(
          schemes, refUrlText, 0, refUrlText.length());
      // We have an example of a well-structured absolute URL with the
      // right scheme in contextURL.
      // Compute a set of substitutions into contextURL specified as
      // (left, right, content) tuples.
      // Then substitute them.

      // Collect enough information to create a ranges object
      // so we can recompose the URL.
      PartRanges.Builder abs = new PartRanges.Builder();

      // Collect parts on this buffer.
      StringBuilder partBuf = new StringBuilder(
          refUrlText.length() + contextUrl.length());

      // True if we have used a part from the given URL instead of
      // the context URL which indicates that we should defer to the
      // given URL for subsequent parts.
      boolean usedGivenUrlPart = false;

      if (rrs.authorityLeft >= 0) {
        usedGivenUrlPart = true;
        int absAuthLeft = partBuf.length();
        partBuf.append(
            refUrlText, rrs.authorityLeft, rrs.authorityRight);
        abs.withAuthority(absAuthLeft, partBuf.length());
      } else if (crs.authorityLeft >= 0) {
        int absAuthLeft = partBuf.length();
        partBuf.append(
            contextUrl, crs.authorityLeft, crs.authorityRight);
        abs.withAuthority(absAuthLeft, partBuf.length());
      }
      int absPathLeft = -1;
      if (rrs.pathLeft < rrs.pathRight || usedGivenUrlPart) {
        absPathLeft = partBuf.length();
        if (rrs.pathLeft >= 0) {
          if (rrs.pathLeft < rrs.pathRight
              && refUrlText.charAt(rrs.pathLeft) == '/') {
            // Absolute path.
            partBuf.append(refUrlText, rrs.pathLeft, rrs.pathRight);
          } else if (!usedGivenUrlPart) {
            // Relative path.
            // Append the context path.
            if (crs.pathLeft < crs.pathRight) {
              partBuf.append(contextUrl, crs.pathLeft, crs.pathRight);
              // Truncate at last '/'.
              // Absolutizing "foo" relative to "/bar/baz" is "/bar/foo"
              // but "foo" relative to "/bar/baz/" is "/bar/baz/foo".
              boolean truncated = false;
              for (int i = partBuf.length(); --i >= absPathLeft;) {
                if (partBuf.charAt(i) == '/') {
                  partBuf.setLength(i + 1);
                  truncated = true;
                  break;
                }
              }
              if (!truncated) {
                // The context path has no '/' so none of it is kept.
                partBuf.setLength(absPathLeft);
              }
            } else if (crs.authorityLeft >= 0) {
              // https://tools.ietf.org/html/rfc3986#section-5.2.3 "Merge Paths" says
              // > If the base URI has a defined authority component and an empty
              // > path, then return a string consisting of "/" concatenated with the
              // > reference's path.
              partBuf.append('/');
            }
            // Append new path
            partBuf.append(refUrlText, rrs.pathLeft, rrs.pathRight);
          }
        }
        usedGivenUrlPart = true;
      } else if (crs.pathLeft >= 0) {
        absPathLeft = partBuf.length();
        partBuf.append(contextUrl, crs.pathLeft, crs.pathRight);
      }
      // Fixup . and ..
      if (absPathLeft >= 0) {
        if (fixupEncodedDots(partBuf, absPathLeft)) {
          cornerCases.add(UrlValue.CornerCase.ENCODED_DOT_PATH_SEGMENST);
        }
        removeDotSegmentsInPlace(partBuf, absPathLeft, cornerCases);
        abs.withPath(absPathLeft, partBuf.length());
      }

      if (rrs.contentLeft < rrs.contentRight
          || rrs.contentMetadataLeft < rrs.contentMetadataRight
          || usedGivenUrlPart) {
        usedGivenUrlPart = true;
        if (rrs.contentMetadataLeft >= 0) {
          int absContentMetadataLeft = partBuf.length();
          partBuf.append(
              refUrlText,
              rrs.contentMetadataLeft, rrs.contentMetadataRight);
          abs.withContentMetadata(absContentMetadataLeft, partBuf.length());
        }
        if (rrs.contentLeft >= 0) {
          int absContentLeft = partBuf.length();
          partBuf.append(refUrlText, rrs.contentLeft, rrs.contentRight);
          abs.withContent(absContentLeft, partBuf.length());
        }
      } else if (
          (crs.contentLeft >= 0 || crs.contentMetadataLeft >= 0)
          && !usedGivenUrlPart) {
        if (crs.contentMetadataLeft >= 0) {
          int absContentMetadataLeft = partBuf.length();
          partBuf.append(
              contextUrl, crs.contentMetadataLeft, crs.contentMetadataRight);
          abs.withContentMetadata(absContentMetadataLeft, partBuf.length());
        }
        if (crs.contentLeft >= 0) {
          int absContentLeft = partBuf.length();
          partBuf.append(contextUrl, crs.contentLeft, crs.contentRight);
          abs.withContent(absContentLeft, partBuf.length());
        }
      }

      if (rrs.queryLeft >= 0) {
        usedGivenUrlPart = true;
        int absQueryLeft = partBuf.length();
        partBuf.append(refUrlText, rrs.queryLeft, rrs.queryRight);
        abs.withQuery(absQueryLeft, partBuf.length());
      } else if (!usedGivenUrlPart && crs.queryLeft >= 0) {
        int absQueryLeft = partBuf.length();
        partBuf.append(contextUrl, crs.queryLeft, crs.queryRight);
        abs.withQuery(absQueryLeft, partBuf.length());
      }

      if (rrs.fragmentLeft >= 0) {
        int absFragmentLeft = partBuf.length();
        partBuf.append(refUrlText, rrs.fragmentLeft, rrs.fragmentRight);
        abs.withFragment(absFragmentLeft, partBuf.length());
      }
      // Do not inherit fragment from context URL.

      // Seed the buffer with the scheme.
      StringBuilder recomposed = new StringBuilder(partBuf.capacity());
      recomposed.append(contextUrl, 0, contextEos);
      PartRanges ranges = abs.build();
      contextScheme.recompose(partBuf, ranges, recomposed);
      absUrlText = recomposed.toString();
      absUrlRanges = scheme.decompose(
          schemes, absUrlText, contextEos, absUrlText.length());
    }

    // A path that starts with "//" when neither the context nor the
    // reference supplied an authority could be mistaken for an authority.
    // Skip the check when either decomposition is missing rather than
    // failing with a NullPointerException.
    if (refUrlRanges != null && absUrlRanges != null
        && contextRanges.authorityLeft < 0 && refUrlRanges.authorityLeft < 0
        && absUrlRanges.pathRight - absUrlRanges.pathLeft >= 2
        && '/' == absUrlText.charAt(absUrlRanges.pathLeft)
        && '/' == absUrlText.charAt(absUrlRanges.pathLeft + 1)) {
      cornerCases.add(UrlValue.CornerCase.PATH_AUTHORITY_AMBIGUITY);
    }

    if (refUrlText.indexOf((char) 0) >= 0) {
      cornerCases.add(UrlValue.CornerCase.UNENCODED_NUL);
    }
    if (refUrlRanges != null) {
      if (CRLF.between(
          refUrlText, refUrlRanges.pathLeft, refUrlRanges.pathRight)) {
        cornerCases.add(UrlValue.CornerCase.NEWLINES_IN_PATH);
      }
      if (ASCII_DISALLOWED_AUTHORITY.betweenUnencoded(
          refUrlText, refUrlRanges.authorityLeft, refUrlRanges.authorityRight)) {
        cornerCases.add(UrlValue.CornerCase.AUTHORITY_NOT_ASCII_STRICT);
      }
    }

    return new Result(
        scheme, refUrlText, refUrlRanges, absUrlText, absUrlRanges, cornerCases);
  }


  /**
   * The result of absolutizing a URL along with structural information
   * found about the input and the output.
   */
  static final class Result {
    /** */
    public final Scheme scheme;
    /** */
    public final String originalUrlText;
    /** */
    public final PartRanges originalUrlRanges;
    /** */
    public final String absUrlText;
    /** */
    public final PartRanges absUrlRanges;

    public final ImmutableSet cornerCases;

    /** */
    public Result(
        Scheme scheme, String originalUrlText,
        PartRanges originalUrlRanges, String absUrlText, PartRanges absUrlRanges,
        EnumSet cornerCases) {
      this.scheme = scheme;
      this.originalUrlText = originalUrlText;
      this.originalUrlRanges = originalUrlRanges;
      this.absUrlText = absUrlText;
      this.absUrlRanges = absUrlRanges;
      this.cornerCases = Sets.immutableEnumSet(cornerCases);
    }
  }


  /**
   * Finds where the scheme of a URL ends.
   *
   * <p>Scans from the left for a ':' and gives up at the first '/', '?',
   * or '#'.  A ':' at index 0 is not accepted as the end of a scheme;
   * scanning simply continues past it.
   *
   * @param urlText the URL text to scan.
   * @return the index just past the ':' that ends the scheme, or -1 if no
   *   scheme was found.
   */
  static int endOfScheme(String urlText) {
    final int limit = urlText.length();
    int pos = 0;
    while (pos < limit) {
      switch (urlText.charAt(pos)) {
        case ':':
          if (pos > 0) {
            return pos + 1;
          }
          break;
        case '/':
        case '?':
        case '#':
          // Reached the path, query, or fragment before any scheme.
          return -1;
        default:
          break;
      }
      ++pos;
    }
    return -1;
  }

  /** Set to true to dump the buffers to stderr on each loop iteration. */
  private static final boolean DEBUG_RDS = false;

  /**
   * Removes "." and ".." segments from the path stored in
   * {@code path[left:]}, following RFC 3986 Section 5.2.4, and truncates
   * the buffer to the simplified path.
   *
   * <p>Unlike the spec, a ".." that empties the output of a path that does
   * not start with '/' also drops the following '/', so a relative path is
   * not turned into an absolute one.  That case is recorded as
   * {@code RELATIVE_URL_MERGED_TO_ABSOLUTE}.
   *
   * <p>{@code PATH_SIMPLIFICATION_REACHES_ROOT_PARENT} is recorded when the
   * path starts with a ".." segment, or when a ".." segment is applied
   * while the output is empty or just "/".
   *
   * @param path the buffer whose tail holds the path; modified in place.
   * @param left the index in path at which the path starts.
   * @param cornerCases receives any corner cases noticed.
   */
  static void removeDotSegmentsInPlace(StringBuilder path, int left,
      EnumSet<UrlValue.CornerCase> cornerCases) {
    // The code below has excerpts from the spec interspersed.
    // The "input buffer" and "output buffer" referred to in the spec
    // are both just regions of path.
    // The loop deals with the exclusive cases by continuing instead
    // of proceeding to the bottom.
    boolean isAbsolute = left < path.length() && path.charAt(left) == '/';
    boolean dotDotNavigatesPastRoot = false;

    // RFC 3986 Section 5.2.4
    // 1.  The input buffer is initialized with the now-appended path
    //     components and the output buffer is initialized to the empty
    //     string.
    int inputBufferStart = left;
    final int inputBufferEnd = path.length();
    final int outputBufferStart = left;
    int outputBufferEnd = left;

    // 2.  While the input buffer is not empty, loop as follows:
    while (inputBufferStart < inputBufferEnd) {
      if (DEBUG_RDS) {
        System.err.println(
            "\t[" + path.substring(outputBufferStart, outputBufferEnd) + "]" +
                path.substring(outputBufferEnd, inputBufferStart) + "[" +
                path.substring(inputBufferStart, inputBufferEnd) + "]");
      }

      char c0 = path.charAt(inputBufferStart);
      //     A.  If the input buffer begins with a prefix of "../" or "./",
      //         then remove that prefix from the input buffer; otherwise,
      if (c0 == '.') {
        char c1;
        if (inputBufferStart + 1 < inputBufferEnd) {
          if ('/' == (c1 = path.charAt(inputBufferStart + 1))) {
            inputBufferStart += 2;
            continue;
          }
          if ('.' == c1 && inputBufferStart + 2 < inputBufferEnd
              && '/' == path.charAt(inputBufferStart + 2)) {
            inputBufferStart += 3;
            dotDotNavigatesPastRoot = true;
            continue;
          }
        }
      }

      //     B.  if the input buffer begins with a prefix of "/./" or "/.",
      //         where "." is a complete path segment, then replace that
      //         prefix with "/" in the input buffer; otherwise,
      if (c0 == '/' && inputBufferStart + 1 < inputBufferEnd
          && '.' == path.charAt(inputBufferStart + 1)) {
        if (inputBufferStart + 2 == inputBufferEnd) {
          // Trailing "/.": overwrite the '.' so the input is now "/".
          inputBufferStart += 1;
          path.setCharAt(inputBufferStart, '/');
          continue;
        } else if ('/' == path.charAt(inputBufferStart + 2)) {
          inputBufferStart += 2;
          continue;
        }
      }

      //     C.  if the input buffer begins with a prefix of "/../" or "/..",
      //         where ".." is a complete path segment, then replace that
      //         prefix with "/" in the input buffer and remove the last
      //         segment and its preceding "/" (if any) from the output
      //         buffer; otherwise,
      if (c0 == '/' && inputBufferStart + 2 < inputBufferEnd
          && '.' == path.charAt(inputBufferStart + 1)
          && '.' == path.charAt(inputBufferStart + 2)) {
        boolean foundDotDot = false;
        if (inputBufferStart + 3 == inputBufferEnd) {
          // Trailing "/..": overwrite the last '.' so the input is now "/".
          inputBufferStart += 2;
          path.setCharAt(inputBufferStart, '/');
          foundDotDot = true;
        } else if ('/' == path.charAt(inputBufferStart + 3)) {
          inputBufferStart += 3;
          foundDotDot = true;
        }
        if (foundDotDot) {
          switch (outputBufferEnd - outputBufferStart) {
            // !!Extra not found in spec!!
            case 0:
              dotDotNavigatesPastRoot = true;
              break;
            case 1:
              if ('/' == path.charAt(outputBufferStart)) {
                dotDotNavigatesPastRoot = true;
              }
              break;
          }
          // Pop the last output segment along with its leading '/'.
          while (outputBufferEnd > outputBufferStart) {
            --outputBufferEnd;
            if (path.charAt(outputBufferEnd) == '/') { break; }
          }
          if (outputBufferEnd == outputBufferStart && !isAbsolute) {
            // !!!This differs from spec!!!
            // Do not convert relative URLs into absolute ones via parent
            // navigation.
            inputBufferStart += 1;
            cornerCases.add(
                UrlValue.CornerCase.RELATIVE_URL_MERGED_TO_ABSOLUTE);
          }
          continue;
        }
      }

      //     D.  if the input buffer consists only of "." or "..", then remove
      //         that from the input buffer; otherwise,
      if (c0 == '.') {
        if (inputBufferStart + 1 == inputBufferEnd) {
          inputBufferStart += 1;
          continue;
        } else if (inputBufferStart + 2 == inputBufferEnd
                   && '.' == path.charAt(inputBufferStart + 1)) {
          inputBufferStart += 2;
          dotDotNavigatesPastRoot = true;
          continue;
        }
      }

      //     E.  move the first path segment in the input buffer to the end of
      //         the output buffer, including the initial "/" character (if
      //         any) and any subsequent characters up to, but not including,
      //         the next "/" character or the end of the input buffer.
      do {
        path.setCharAt(outputBufferEnd++, path.charAt(inputBufferStart++));
      } while (inputBufferStart < inputBufferEnd
               && path.charAt(inputBufferStart) != '/');
    }

    // 3.  Finally, the output buffer is returned as the result of
    //     remove_dot_segments.
    path.setLength(outputBufferEnd);

    if (dotDotNavigatesPastRoot) {
      cornerCases.add(UrlValue.CornerCase.PATH_SIMPLIFICATION_REACHES_ROOT_PARENT);
    }
  }

  /**
   * Experimental switch.  When true, path segments that decode to "." or
   * ".." are rewritten in place to their unencoded form.  When false they
   * are only detected.
   */
  static final boolean RECODE_ENCODED_SPECIAL_PATH_SEGMENTS = false;

  /**
   * Looks in {@code partBuf[pathLeft:]} for whole path segments that decode
   * to "." or ".." but contain at least one "%2e" / "%2E".
   *
   * <p>With {@link #RECODE_ENCODED_SPECIAL_PATH_SEGMENTS} off, the buffer is
   * left untouched.  With it on, each such segment is replaced by its plain
   * form and the rest of the path is shifted left, which is possible because
   * the plain form is always shorter than the encoded one.
   *
   * @param partBuf the buffer whose tail holds the path.
   * @param pathLeft the index in partBuf at which the path starts.
   * @return true if at least one such segment was found.
   */
  static boolean fixupEncodedDots(StringBuilder partBuf, int pathLeft) {
    boolean compacting = false;
    boolean sawEncodedDotSegment = false;
    int out = pathLeft;
    final int end = partBuf.length();

    int pos = pathLeft;
    while (pos < end) {
      if (encodedDotAt(partBuf, pos)) {
        // Extend left over one adjacent plain or encoded dot, if present.
        int segStart = pos;
        if (pos - 1 >= pathLeft) {
          if (partBuf.charAt(pos - 1) == '.') {
            segStart = pos - 1;
          } else if (pos - 3 >= pathLeft && encodedDotAt(partBuf, pos - 3)) {
            segStart = pos - 3;
          }
        }

        // Extend right over one adjacent dot only if we did not extend left.
        int segEnd = pos + 3;
        if (segStart == pos && segEnd != end) {
          if (partBuf.charAt(pos + 3) == '.') {
            segEnd = pos + 4;
          } else if (encodedDotAt(partBuf, pos + 3)) {
            segEnd = pos + 6;
          }
        }

        // partBuf[segStart:segEnd] decodes to "." or ".." with at least one
        // encoded dot.  It only matters when it is a complete segment.
        boolean wholeSegment =
            (segStart == pathLeft || partBuf.charAt(segStart - 1) == '/')
            && (segEnd == end || partBuf.charAt(segEnd) == '/');
        if (wholeSegment) {
          sawEncodedDotSegment = true;
          if (RECODE_ENCODED_SPECIAL_PATH_SEGMENTS) {
            compacting = true;
            partBuf.setCharAt(out++, '.');
            if (segStart == pos && segEnd != pos + 3) {
              partBuf.setCharAt(out++, '.');
            }
          }
          // Resume right after the segment; no need to look at it again.
          pos = segEnd;
          continue;
        }
      }

      if (compacting) {
        // Shift this character left over the space freed so far.
        partBuf.setCharAt(out++, partBuf.charAt(pos));
      }
      ++pos;
    }

    if (compacting) {
      partBuf.setLength(out);
    }
    return sawEncodedDotSegment;
  }

  /**
   * @return true if partBuf holds "%2e" or "%2E" at index i with at least
   *   three characters available from i.
   */
  private static boolean encodedDotAt(StringBuilder partBuf, int i) {
    if (i + 2 >= partBuf.length()) {
      return false;
    }
    if (partBuf.charAt(i) != '%' || partBuf.charAt(i + 1) != '2') {
      return false;
    }
    char hexDigit = partBuf.charAt(i + 2);
    return hexDigit == 'e' || hexDigit == 'E';
  }

  /** The carriage return and line feed characters. */
  private static final AsciiSet CRLF;
  /**
   * Every ASCII character except letters, digits, and the punctuation
   * cleared in the initializer below.
   */
  private static final AsciiSet ASCII_DISALLOWED_AUTHORITY;
  static {
    boolean[] lineBreaks = new boolean[0x20];
    lineBreaks['\r'] = true;
    lineBreaks['\n'] = true;
    CRLF = new AsciiSet(lineBreaks);

    // Start with all of ASCII disallowed and then clear what is allowed.
    boolean[] forbidden = new boolean[0x80];
    Arrays.fill(forbidden, true);
    for (char upper = 'A', lower = 'a'; upper <= 'Z'; ++upper, ++lower) {
      forbidden[upper] = false;
      forbidden[lower] = false;
    }
    for (char digit = '0'; digit <= '9'; ++digit) {
      forbidden[digit] = false;
    }
    String allowedPunctuation =
        "-._~"
        // Allowed in Pct-decoded or in userinfo or as authority delimiter
        + "%:@"
        // Allowed in IPV6 and IPFuture
        + "[]"
        // Sub-delims allowed in userInfo and regName
        + "!$&'()*+,;=";
    for (int i = 0, n = allowedPunctuation.length(); i < n; ++i) {
      forbidden[allowedPunctuation.charAt(i)] = false;
    }
    ASCII_DISALLOWED_AUTHORITY = new AsciiSet(forbidden);
  }
}

/**
 * A set of characters backed by a table of flags indexed by character
 * value.  Characters at or beyond the end of the table are never members.
 */
final class AsciiSet {
  /** members[c] is true when character c is in the set. */
  private final boolean[] members;

  /**
   * @param cps the membership table, indexed by character value.  The array
   *   is stored without copying.
   */
  AsciiSet(boolean[] cps) {
    this.members = cps;
  }

  /**
   * Tests the literal characters of {@code s[left:right]} without any
   * percent-decoding.
   *
   * @return true if any of those characters is in this set.  Always false
   *   when left is negative.
   */
  boolean betweenUnencoded(String s, int left, int right) {
    if (left < 0) {
      return false;
    }
    int i = left;
    while (i < right) {
      if (contains(s.charAt(i))) {
        return true;
      }
      ++i;
    }
    return false;
  }

  /**
   * Like {@link #betweenUnencoded} but, at each '%', also tests the value
   * returned by {@code Percent.pctHex2} (presumably the decoded value of the
   * escape, negative when it is not a valid escape; confirm against
   * {@code Percent}).
   *
   * @return true if any literal or decoded value is in this set.  Always
   *   false when left is negative.
   */
  boolean between(String s, int left, int right) {
    if (left < 0) {
      return false;
    }
    for (int i = left; i < right; ++i) {
      char c = s.charAt(i);
      if (contains(c)) {
        return true;
      }
      if (c == '%' && contains(Percent.pctHex2(s, i, right))) {
        return true;
      }
    }
    return false;
  }

  /** True if cp is inside the table and flagged as a member. */
  private boolean contains(int cp) {
    return 0 <= cp && cp < members.length && members[cp];
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy