okhttp3.internal.publicsuffix.PublicSuffixDatabase.kt Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of impersonator Show documentation
Spoof TLS/JA3/JA4 and HTTP/2 fingerprints in Java
The newest version!
/*
 * Copyright (C) 2017 Square, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package okhttp3.internal.publicsuffix

import java.io.IOException
import java.io.InterruptedIOException
import java.net.IDN
import java.nio.charset.StandardCharsets.UTF_8
import java.util.concurrent.CountDownLatch
import java.util.concurrent.atomic.AtomicBoolean
import okhttp3.internal.and
import okhttp3.internal.platform.Platform
import okio.GzipSource
import okio.buffer
import okio.source

/**
 * A database of public suffixes provided by [publicsuffix.org][publicsuffix_org].
 *
 * [publicsuffix_org]: https://publicsuffix.org/
 */
class PublicSuffixDatabase {

  /** True after we've attempted to read the list for the first time. */
  private val listRead = AtomicBoolean(false)

  /** Used for concurrent threads reading the list for the first time. */
  private val readCompleteLatch = CountDownLatch(1)

  // The lists are held as a large array of UTF-8 bytes. This is to avoid allocating lots of strings
  // that will likely never be used. Each rule is separated by '\n'. Please see the
  // PublicSuffixListGenerator class for how these lists are generated.
  // Guarded by this.
  private lateinit var publicSuffixListBytes: ByteArray
  private lateinit var publicSuffixExceptionListBytes: ByteArray

  /**
   * Returns the effective top-level domain plus one (eTLD+1) by referencing the public suffix list.
   * Returns null if the domain is a public suffix or a private address.
   *
   * Here are some examples:
   *
   * ```
   * assertEquals("google.com", getEffectiveTldPlusOne("google.com"));
   * assertEquals("google.com", getEffectiveTldPlusOne("www.google.com"));
   * assertNull(getEffectiveTldPlusOne("com"));
   * assertNull(getEffectiveTldPlusOne("localhost"));
   * assertNull(getEffectiveTldPlusOne("mymacbook"));
   * ```
   *
   * @param domain A canonicalized domain. An International Domain Name (IDN) should be punycode
   *     encoded.
   */
  fun getEffectiveTldPlusOne(domain: String): String? {
    // We use UTF-8 in the list so we need to convert to Unicode.
    val unicodeDomain = IDN.toUnicode(domain)
    val domainLabels = splitDomain(unicodeDomain)

    val rule = findMatchingRule(domainLabels)
    if (domainLabels.size == rule.size && rule[0][0] != EXCEPTION_MARKER) {
      return null // The domain is a public suffix.
    }

    val firstLabelOffset = if (rule[0][0] == EXCEPTION_MARKER) {
      // Exception rules hold the effective TLD plus one.
      domainLabels.size - rule.size
    } else {
      // Otherwise the rule is for a public suffix, so we must take one more label.
      domainLabels.size - (rule.size + 1)
    }

    return splitDomain(domain).asSequence().drop(firstLabelOffset).joinToString(".")
  }

  private fun splitDomain(domain: String): List {
    val domainLabels = domain.split('.')

    if (domainLabels.last() == "") {
      // allow for domain name trailing dot
      return domainLabels.dropLast(1)
    }

    return domainLabels
  }

  private fun findMatchingRule(domainLabels: List): List {
    if (!listRead.get() && listRead.compareAndSet(false, true)) {
      readTheListUninterruptibly()
    } else {
      try {
        readCompleteLatch.await()
      } catch (_: InterruptedException) {
        Thread.currentThread().interrupt() // Retain interrupted status.
      }
    }

    check(::publicSuffixListBytes.isInitialized) {
      "Unable to load $PUBLIC_SUFFIX_RESOURCE resource from the classpath."
    }

    // Break apart the domain into UTF-8 labels, i.e. foo.bar.com turns into [foo, bar, com].
    val domainLabelsUtf8Bytes = Array(domainLabels.size) { i -> domainLabels[i].toByteArray(UTF_8) }

    // Start by looking for exact matches. We start at the leftmost label. For example, foo.bar.com
    // will look like: [foo, bar, com], [bar, com], [com]. The longest matching rule wins.
    var exactMatch: String? = null
    for (i in domainLabelsUtf8Bytes.indices) {
      val rule = publicSuffixListBytes.binarySearch(domainLabelsUtf8Bytes, i)
      if (rule != null) {
        exactMatch = rule
        break
      }
    }

    // In theory, wildcard rules are not restricted to having the wildcard in the leftmost position.
    // In practice, wildcards are always in the leftmost position. For now, this implementation
    // cheats and does not attempt every possible permutation. Instead, it only considers wildcards
    // in the leftmost position. We assert this fact when we generate the public suffix file. If
    // this assertion ever fails we'll need to refactor this implementation.
    var wildcardMatch: String? = null
    if (domainLabelsUtf8Bytes.size > 1) {
      val labelsWithWildcard = domainLabelsUtf8Bytes.clone()
      for (labelIndex in 0 until labelsWithWildcard.size - 1) {
        labelsWithWildcard[labelIndex] = WILDCARD_LABEL
        val rule = publicSuffixListBytes.binarySearch(labelsWithWildcard, labelIndex)
        if (rule != null) {
          wildcardMatch = rule
          break
        }
      }
    }

    // Exception rules only apply to wildcard rules, so only try it if we matched a wildcard.
    var exception: String? = null
    if (wildcardMatch != null) {
      for (labelIndex in 0 until domainLabelsUtf8Bytes.size - 1) {
        val rule = publicSuffixExceptionListBytes.binarySearch(
            domainLabelsUtf8Bytes, labelIndex)
        if (rule != null) {
          exception = rule
          break
        }
      }
    }

    if (exception != null) {
      // Signal we've identified an exception rule.
      exception = "!$exception"
      return exception.split('.')
    } else if (exactMatch == null && wildcardMatch == null) {
      return PREVAILING_RULE
    }

    val exactRuleLabels = exactMatch?.split('.') ?: listOf()
    val wildcardRuleLabels = wildcardMatch?.split('.') ?: listOf()

    return if (exactRuleLabels.size > wildcardRuleLabels.size) {
      exactRuleLabels
    } else {
      wildcardRuleLabels
    }
  }

  /**
   * Reads the public suffix list treating the operation as uninterruptible. We always want to read
   * the list otherwise we'll be left in a bad state. If the thread was interrupted prior to this
   * operation, it will be re-interrupted after the list is read.
   */
  private fun readTheListUninterruptibly() {
    var interrupted = false
    try {
      while (true) {
        try {
          readTheList()
          return
        } catch (_: InterruptedIOException) {
          Thread.interrupted() // Temporarily clear the interrupted state.
          interrupted = true
        } catch (e: IOException) {
          Platform.get().log("Failed to read public suffix list", Platform.WARN, e)
          return
        }
      }
    } finally {
      if (interrupted) {
        Thread.currentThread().interrupt() // Retain interrupted status.
      }
    }
  }

  @Throws(IOException::class)
  private fun readTheList() {
    try {
      var publicSuffixListBytes: ByteArray?
      var publicSuffixExceptionListBytes: ByteArray?

      val resource =
        PublicSuffixDatabase::class.java.getResourceAsStream(PUBLIC_SUFFIX_RESOURCE) ?: return

      GzipSource(resource.source()).buffer().use { bufferedSource ->
        val totalBytes = bufferedSource.readInt()
        publicSuffixListBytes = bufferedSource.readByteArray(totalBytes.toLong())

        val totalExceptionBytes = bufferedSource.readInt()
        publicSuffixExceptionListBytes = bufferedSource.readByteArray(totalExceptionBytes.toLong())
      }

      synchronized(this) {
        this.publicSuffixListBytes = publicSuffixListBytes!!
        this.publicSuffixExceptionListBytes = publicSuffixExceptionListBytes!!
      }
    } finally {
      readCompleteLatch.countDown()
    }
  }

  /** Visible for testing. */
  fun setListBytes(
    publicSuffixListBytes: ByteArray,
    publicSuffixExceptionListBytes: ByteArray
  ) {
    this.publicSuffixListBytes = publicSuffixListBytes
    this.publicSuffixExceptionListBytes = publicSuffixExceptionListBytes
    listRead.set(true)
    readCompleteLatch.countDown()
  }

  companion object {
    const val PUBLIC_SUFFIX_RESOURCE = "publicsuffixes.gz"

    private val WILDCARD_LABEL = byteArrayOf('*'.toByte())
    private val PREVAILING_RULE = listOf("*")

    private const val EXCEPTION_MARKER = '!'

    private val instance = PublicSuffixDatabase()

    fun get(): PublicSuffixDatabase {
      return instance
    }

    private fun ByteArray.binarySearch(
      labels: Array,
      labelIndex: Int
    ): String? {
      var low = 0
      var high = size
      var match: String? = null
      while (low < high) {
        var mid = (low + high) / 2
        // Search for a '\n' that marks the start of a value. Don't go back past the start of the
        // array.
        while (mid > -1 && this[mid] != '\n'.toByte()) {
          mid--
        }
        mid++

        // Now look for the ending '\n'.
        var end = 1
        while (this[mid + end] != '\n'.toByte()) {
          end++
        }
        val publicSuffixLength = mid + end - mid

        // Compare the bytes. Note that the file stores UTF-8 encoded bytes, so we must compare the
        // unsigned bytes.
        var compareResult: Int
        var currentLabelIndex = labelIndex
        var currentLabelByteIndex = 0
        var publicSuffixByteIndex = 0

        var expectDot = false
        while (true) {
          val byte0: Int
          if (expectDot) {
            byte0 = '.'.toInt()
            expectDot = false
          } else {
            byte0 = labels[currentLabelIndex][currentLabelByteIndex] and 0xff
          }

          val byte1 = this[mid + publicSuffixByteIndex] and 0xff

          compareResult = byte0 - byte1
          if (compareResult != 0) break

          publicSuffixByteIndex++
          currentLabelByteIndex++
          if (publicSuffixByteIndex == publicSuffixLength) break

          if (labels[currentLabelIndex].size == currentLabelByteIndex) {
            // We've exhausted our current label. Either there are more labels to compare, in which
            // case we expect a dot as the next character. Otherwise, we've checked all our labels.
            if (currentLabelIndex == labels.size - 1) {
              break
            } else {
              currentLabelIndex++
              currentLabelByteIndex = -1
              expectDot = true
            }
          }
        }

        if (compareResult < 0) {
          high = mid - 1
        } else if (compareResult > 0) {
          low = mid + end + 1
        } else {
          // We found a match, but are the lengths equal?
          val publicSuffixBytesLeft = publicSuffixLength - publicSuffixByteIndex
          var labelBytesLeft = labels[currentLabelIndex].size - currentLabelByteIndex
          for (i in currentLabelIndex + 1 until labels.size) {
            labelBytesLeft += labels[i].size
          }

          if (labelBytesLeft < publicSuffixBytesLeft) {
            high = mid - 1
          } else if (labelBytesLeft > publicSuffixBytesLeft) {
            low = mid + end + 1
          } else {
            // Found a match.
            match = String(this, mid, publicSuffixLength, UTF_8)
            break
          }
        }
      }
      return match
    }
  }
}