org.aoju.bus.http.metric.suffix.Suffixes Maven / Gradle / Ivy
The newest version!
/*
* Copyright (C) 2017 Square, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.aoju.bus.http.metric.suffix;
import org.aoju.bus.core.io.source.BufferSource;
import org.aoju.bus.core.io.source.GzipSource;
import org.aoju.bus.core.toolkit.IoKit;
import java.io.IOException;
import java.io.InputStream;
import java.io.InterruptedIOException;
import java.net.IDN;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.atomic.AtomicBoolean;
import static java.nio.charset.StandardCharsets.UTF_8;
/**
 * A database of public suffixes provided by
 * <a href="https://publicsuffix.org/">publicsuffix.org</a>.
 *
 * <p>The rules are loaded from the {@link #PUBLIC_SUFFIX_RESOURCE} classpath resource the first
 * time a lookup is performed. If that load fails, every lookup throws
 * {@link IllegalStateException}.
 */
public final class Suffixes {

    public static final String PUBLIC_SUFFIX_RESOURCE = "suffixes.gz";

    private static final byte[] WILDCARD_LABEL = new byte[]{'*'};
    private static final String[] EMPTY_RULE = new String[0];
    private static final String[] PREVAILING_RULE = new String[]{"*"};
    private static final byte EXCEPTION_MARKER = '!';
    private static final Suffixes instance = new Suffixes();

    /**
     * True after we've attempted to read the list for the first time.
     */
    private final AtomicBoolean listRead = new AtomicBoolean(false);

    /**
     * Used for concurrent threads reading the list for the first time. It is released once the
     * first read attempt has finished, whether or not that attempt succeeded.
     */
    private final CountDownLatch readCompleteLatch = new CountDownLatch(1);

    // The lists are held as a large array of UTF-8 bytes. This is to avoid allocating lots of strings
    // that will likely never be used. Each rule is separated by '\n'. Please see the
    // PublicSuffixListGenerator class for how these lists are generated.
    // Guarded by this.
    private byte[] publicSuffixListBytes;
    private byte[] publicSuffixExceptionListBytes;

    /**
     * Returns the shared instance.
     */
    public static Suffixes get() {
        return instance;
    }

    /**
     * Binary-searches {@code bytesToSearch} for the rule equal to the labels from
     * {@code labelIndex} onwards joined with {@code '.'}.
     *
     * <p>The search relies on {@code bytesToSearch} holding rules in ascending unsigned-byte
     * order, each rule (including the last) terminated by {@code '\n'}; the scan for the
     * terminator is not bounds-checked.
     *
     * @param bytesToSearch the newline-separated UTF-8 rule list
     * @param labels        the domain labels as UTF-8 bytes
     * @param labelIndex    index of the first label taking part in the comparison
     * @return the matching rule, or null if there is none
     */
    private static String binarySearchBytes(byte[] bytesToSearch, byte[][] labels, int labelIndex) {
        int low = 0;
        int high = bytesToSearch.length;
        String match = null;
        while (low < high) {
            // Unsigned shift keeps the midpoint correct even if low + high overflows an int.
            int mid = (low + high) >>> 1;
            // Search for a '\n' that marks the start of a value. Don't go back past the start of
            // the array.
            while (mid > -1 && bytesToSearch[mid] != '\n') {
                mid--;
            }
            mid++;

            // Now look for the ending '\n'. 'end' is therefore also the length of this rule.
            int end = 1;
            while (bytesToSearch[mid + end] != '\n') {
                end++;
            }
            int publicSuffixLength = end;

            // Compare the bytes. Note that the file stores UTF-8 encoded bytes, so we must compare
            // the unsigned bytes.
            int compareResult;
            int currentLabelIndex = labelIndex;
            int currentLabelByteIndex = 0;
            int publicSuffixByteIndex = 0;
            boolean expectDot = false;
            while (true) {
                int byte0;
                if (expectDot) {
                    // Labels carry no separators, so synthesize the '.' between two labels.
                    byte0 = '.';
                    expectDot = false;
                } else {
                    byte0 = labels[currentLabelIndex][currentLabelByteIndex] & 0xff;
                }

                int byte1 = bytesToSearch[mid + publicSuffixByteIndex] & 0xff;

                compareResult = byte0 - byte1;
                if (compareResult != 0) {
                    break;
                }

                publicSuffixByteIndex++;
                currentLabelByteIndex++;
                if (publicSuffixByteIndex == publicSuffixLength) {
                    break;
                }

                if (labels[currentLabelIndex].length == currentLabelByteIndex) {
                    // We've exhausted our current label. Either there are more labels to compare,
                    // in which case we expect a dot as the next character. Otherwise, we've
                    // checked all our labels.
                    if (currentLabelIndex == labels.length - 1) {
                        break;
                    } else {
                        currentLabelIndex++;
                        // -1 because the synthesized dot consumes one increment before the first
                        // real byte of the next label is read.
                        currentLabelByteIndex = -1;
                        expectDot = true;
                    }
                }
            }

            if (compareResult < 0) {
                high = mid - 1;
            } else if (compareResult > 0) {
                low = mid + end + 1;
            } else {
                // We found a match, but are the lengths equal?
                int publicSuffixBytesLeft = publicSuffixLength - publicSuffixByteIndex;
                int labelBytesLeft = labels[currentLabelIndex].length - currentLabelByteIndex;
                for (int i = currentLabelIndex + 1; i < labels.length; i++) {
                    labelBytesLeft += labels[i].length;
                }

                if (labelBytesLeft < publicSuffixBytesLeft) {
                    high = mid - 1;
                } else if (labelBytesLeft > publicSuffixBytesLeft) {
                    low = mid + end + 1;
                } else {
                    // Found a match.
                    match = new String(bytesToSearch, mid, publicSuffixLength, UTF_8);
                    break;
                }
            }
        }
        return match;
    }

    /**
     * Returns the effective top-level domain plus one (eTLD+1) by referencing the public suffix list.
     * Returns null if the domain is a public suffix or a private address.
     *
     * <p>Here are some examples:
     * <pre>{@code
     * assertEquals("google.com", getEffectiveTldPlusOne("google.com"));
     * assertEquals("google.com", getEffectiveTldPlusOne("www.google.com"));
     * assertNull(getEffectiveTldPlusOne("com"));
     * assertNull(getEffectiveTldPlusOne("localhost"));
     * assertNull(getEffectiveTldPlusOne("mymacbook"));
     * }</pre>
     *
     * @param domain A canonicalized domain. An International Domain Name (IDN) should be punycode
     *               encoded.
     * @return the eTLD+1 built from the labels of {@code domain} as given, or null
     * @throws NullPointerException  if {@code domain} is null
     * @throws IllegalStateException if the public suffix list could not be loaded
     */
    public String getEffectiveTldPlusOne(String domain) {
        if (domain == null) {
            throw new NullPointerException("domain == null");
        }

        // We use UTF-8 in the list so we need to convert to Unicode.
        String unicodeDomain = IDN.toUnicode(domain);
        String[] domainLabels = unicodeDomain.split("\\.");
        String[] rule = findMatchingRule(domainLabels);
        if (domainLabels.length == rule.length && rule[0].charAt(0) != EXCEPTION_MARKER) {
            // The domain is a public suffix.
            return null;
        }

        int firstLabelOffset;
        if (rule[0].charAt(0) == EXCEPTION_MARKER) {
            // Exception rules hold the effective TLD plus one.
            firstLabelOffset = domainLabels.length - rule.length;
        } else {
            // Otherwise the rule is for a public suffix, so we must take one more label.
            firstLabelOffset = domainLabels.length - (rule.length + 1);
        }

        // The result is assembled from the caller's original (punycode) labels, not the Unicode
        // ones that were used for matching.
        StringBuilder effectiveTldPlusOne = new StringBuilder();
        String[] punycodeLabels = domain.split("\\.");
        for (int i = firstLabelOffset; i < punycodeLabels.length; i++) {
            effectiveTldPlusOne.append(punycodeLabels[i]).append('.');
        }
        effectiveTldPlusOne.deleteCharAt(effectiveTldPlusOne.length() - 1);

        return effectiveTldPlusOne.toString();
    }

    /**
     * Finds the rule that applies to the given labels, loading the list first if necessary.
     *
     * @param domainLabels the Unicode labels of the domain, leftmost first
     * @return the labels of the winning rule; an exception rule has {@code '!'} prepended to its
     *         first label, and {@link #PREVAILING_RULE} is returned when nothing matches
     * @throws IllegalStateException if the public suffix list is not available
     */
    private String[] findMatchingRule(String[] domainLabels) {
        if (!listRead.get() && listRead.compareAndSet(false, true)) {
            readTheListUninterruptibly();
        } else {
            try {
                readCompleteLatch.await();
            } catch (InterruptedException ignored) {
                Thread.currentThread().interrupt(); // Retain interrupted status.
            }
        }

        // Snapshot both arrays under the lock that guards them and search the snapshots.
        byte[] suffixBytes;
        byte[] exceptionBytes;
        synchronized (this) {
            suffixBytes = publicSuffixListBytes;
            exceptionBytes = publicSuffixExceptionListBytes;
        }
        if (suffixBytes == null) {
            throw new IllegalStateException("Unable to load " + PUBLIC_SUFFIX_RESOURCE + " resource "
                    + "from the classpath.");
        }

        // Break apart the domain into UTF-8 labels, i.e. foo.bar.com turns into [foo, bar, com].
        byte[][] domainLabelsUtf8Bytes = new byte[domainLabels.length][];
        for (int i = 0; i < domainLabels.length; i++) {
            domainLabelsUtf8Bytes[i] = domainLabels[i].getBytes(UTF_8);
        }

        // Start by looking for exact matches. We start at the leftmost label. For example, foo.bar.com
        // will look like: [foo, bar, com], [bar, com], [com]. The longest matching rule wins.
        String exactMatch = null;
        for (int i = 0; i < domainLabelsUtf8Bytes.length; i++) {
            String rule = binarySearchBytes(suffixBytes, domainLabelsUtf8Bytes, i);
            if (rule != null) {
                exactMatch = rule;
                break;
            }
        }

        // In theory, wildcard rules are not restricted to having the wildcard in the leftmost position.
        // In practice, wildcards are always in the leftmost position. For now, this implementation
        // cheats and does not attempt every possible permutation. Instead, it only considers wildcards
        // in the leftmost position. We assert this fact when we generate the public suffix file. If
        // this assertion ever fails we'll need to refactor this implementation.
        String wildcardMatch = null;
        if (domainLabelsUtf8Bytes.length > 1) {
            byte[][] labelsWithWildcard = domainLabelsUtf8Bytes.clone();
            for (int labelIndex = 0; labelIndex < labelsWithWildcard.length - 1; labelIndex++) {
                labelsWithWildcard[labelIndex] = WILDCARD_LABEL;
                String rule = binarySearchBytes(suffixBytes, labelsWithWildcard, labelIndex);
                if (rule != null) {
                    wildcardMatch = rule;
                    break;
                }
            }
        }

        // Exception rules only apply to wildcard rules, so only try it if we matched a wildcard.
        String exception = null;
        if (wildcardMatch != null) {
            for (int labelIndex = 0; labelIndex < domainLabelsUtf8Bytes.length - 1; labelIndex++) {
                String rule = binarySearchBytes(exceptionBytes, domainLabelsUtf8Bytes, labelIndex);
                if (rule != null) {
                    exception = rule;
                    break;
                }
            }
        }

        if (exception != null) {
            // Signal we've identified an exception rule.
            exception = "!" + exception;
            return exception.split("\\.");
        } else if (exactMatch == null && wildcardMatch == null) {
            return PREVAILING_RULE;
        }

        String[] exactRuleLabels = exactMatch != null
                ? exactMatch.split("\\.")
                : EMPTY_RULE;

        String[] wildcardRuleLabels = wildcardMatch != null
                ? wildcardMatch.split("\\.")
                : EMPTY_RULE;

        return exactRuleLabels.length > wildcardRuleLabels.length
                ? exactRuleLabels
                : wildcardRuleLabels;
    }

    /**
     * Reads the public suffix list treating the operation as uninterruptible. We always want to read
     * the list otherwise we'll be left in a bad state. If the thread was interrupted prior to this
     * operation, it will be re-interrupted after the list is read.
     *
     * <p>The latch is released when this method finishes, even if the read failed. Otherwise
     * every later lookup would block forever in {@code await()} instead of reporting the missing
     * list.
     */
    private void readTheListUninterruptibly() {
        boolean interrupted = false;
        try {
            while (true) {
                try {
                    readTheList();
                    return;
                } catch (InterruptedIOException e) {
                    Thread.interrupted(); // Temporarily clear the interrupted state.
                    interrupted = true;
                } catch (IOException e) {
                    // Give up. The arrays stay null, so lookups fail with IllegalStateException.
                    return;
                }
            }
        } finally {
            // Release waiters only once the attempt (including any retries) is over.
            readCompleteLatch.countDown();
            if (interrupted) {
                Thread.currentThread().interrupt(); // Retain interrupted status.
            }
        }
    }

    /**
     * Reads both rule lists from the gzipped classpath resource and publishes them. The resource
     * holds, in order: the byte length of the suffix list, the suffix list, the byte length of the
     * exception list, the exception list. Does nothing if the resource cannot be found.
     *
     * @throws IOException if reading the resource fails
     */
    private void readTheList() throws IOException {
        byte[] publicSuffixListBytes;
        byte[] publicSuffixExceptionListBytes;

        InputStream resource = Suffixes.class.getResourceAsStream(PUBLIC_SUFFIX_RESOURCE);
        if (resource == null) {
            return;
        }

        try (BufferSource source = IoKit.buffer(new GzipSource(IoKit.source(resource)))) {
            int totalBytes = source.readInt();
            publicSuffixListBytes = new byte[totalBytes];
            source.readFully(publicSuffixListBytes);

            int totalExceptionBytes = source.readInt();
            publicSuffixExceptionListBytes = new byte[totalExceptionBytes];
            source.readFully(publicSuffixExceptionListBytes);
        }

        synchronized (this) {
            this.publicSuffixListBytes = publicSuffixListBytes;
            this.publicSuffixExceptionListBytes = publicSuffixExceptionListBytes;
        }
    }

    /**
     * Visible for testing. Installs the given lists directly and marks the list as read, so no
     * classpath load is attempted afterwards.
     */
    void setListBytes(byte[] publicSuffixListBytes, byte[] publicSuffixExceptionListBytes) {
        synchronized (this) {
            this.publicSuffixListBytes = publicSuffixListBytes;
            this.publicSuffixExceptionListBytes = publicSuffixExceptionListBytes;
        }
        listRead.set(true);
        readCompleteLatch.countDown();
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy