org.apache.gobblin.ingestion.google.webmaster.UrlTriePrefixGrouper Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of google-ingestion Show documentation
A distributed data integration framework for streaming and batch data ecosystems.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.gobblin.ingestion.google.webmaster;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.NoSuchElementException;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.commons.lang3.tuple.Triple;
public class UrlTriePrefixGrouper implements UrlGrouper {
private final int _groupSize;
private final UrlTrie _trie;
private final Iterator> _iterator;
private Triple _retVal;
public UrlTriePrefixGrouper(UrlTrie trie, int groupSize) {
_trie = trie;
_groupSize = groupSize;
_iterator = new UrlTriePostOrderIterator(trie, groupSize);
}
@Override
public boolean hasNext() {
if (_retVal != null) {
return true;
}
while (_iterator.hasNext() && _retVal == null) {
Pair nextPair = _iterator.next();
UrlTrieNode nextNode = nextPair.getRight();
if (nextNode.getSize() <= _groupSize) {
_retVal = Triple.of(nextPair.getLeft(), GoogleWebmasterFilter.FilterOperator.CONTAINS, nextNode);
return true;
} else if (nextNode.isExist()) {
_retVal = Triple.of(nextPair.getLeft(), GoogleWebmasterFilter.FilterOperator.EQUALS, nextNode);
return true;
}
}
return false;
}
@Override
public Triple next() {
if (hasNext()) {
Triple retVal = _retVal;
_retVal = null;
return retVal;
}
throw new NoSuchElementException();
}
public UrlTrie getTrie() {
return _trie;
}
/**
* Get the detailed pages under this group
*/
public static ArrayList groupToPages(Triple group) {
ArrayList ret = new ArrayList<>();
if (group.getMiddle().equals(GoogleWebmasterFilter.FilterOperator.EQUALS)) {
if (group.getRight().isExist()) {
ret.add(group.getLeft());
}
} else if (group.getMiddle().equals(GoogleWebmasterFilter.FilterOperator.CONTAINS)) {
UrlTrie trie = new UrlTrie(group.getLeft(), group.getRight());
Iterator> iterator = new UrlTriePostOrderIterator(trie, 1);
while (iterator.hasNext()) {
Pair next = iterator.next();
if (next.getRight().isExist()) {
ret.add(next.getLeft());
}
}
}
return ret;
}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
@Override
public int getGroupSize() {
return _groupSize;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy