
// gobblin.ingestion.google.webmaster.TrieBasedProducerJob (Maven / Gradle / Ivy)
package gobblin.ingestion.google.webmaster;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.lang3.tuple.Triple;
public class TrieBasedProducerJob extends ProducerJob {
private final String _startDate;
private final String _endDate;
private final int _groupSize;
private final Triple _jobNode;
TrieBasedProducerJob(String startDate, String endDate,
Triple jobNode, int groupSize) {
_startDate = startDate;
_endDate = endDate;
_jobNode = jobNode;
_groupSize = groupSize;
}
@Override
public String getPage() {
return _jobNode.getLeft();
}
@Override
public String getStartDate() {
return _startDate;
}
@Override
public String getEndDate() {
return _endDate;
}
@Override
public GoogleWebmasterFilter.FilterOperator getOperator() {
return _jobNode.getMiddle();
}
@Override
public int getPagesSize() {
if (isOperatorEquals()) {
return 1;
} else {
return _jobNode.getRight().getSize();
}
}
/**
* The implementation here will first partition the job by pages, and then by dates.
* @return
*/
@Override
public List extends ProducerJob> partitionJobs() {
UrlTrieNode root = _jobNode.getRight();
if (isOperatorEquals() || root.getSize() == 1) {
//Either at an Equals-Node or a Leaf-Node, both of which actually has actual size 1.
return super.partitionJobs();
} else {
if (_groupSize <= 1) {
throw new RuntimeException("This is impossible. When group size is 1, the operator must be equals");
}
UrlTrie trie = new UrlTrie(getPage(), root);
int gs = Math.min(root.getSize(), _groupSize);
UrlTriePrefixGrouper grouper = new UrlTriePrefixGrouper(trie, (int) Math.ceil(gs / 2.0));
List jobs = new ArrayList<>();
while (grouper.hasNext()) {
jobs.add(new TrieBasedProducerJob(_startDate, _endDate, grouper.next(), grouper.getGroupSize()));
}
return jobs;
}
}
private boolean isOperatorEquals() {
return getOperator().equals(GoogleWebmasterFilter.FilterOperator.EQUALS);
}
@Override
public String toString() {
return String.format(
"TrieBasedProducerJob{_page='%s', _startDate='%s', _endDate='%s', _operator='%s', _groupSize='%s', _nodeSize='%s'}",
getPage(), _startDate, _endDate, getOperator(), _groupSize, _jobNode.getRight().getSize());
}
public int getGroupSize() {
return _groupSize;
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy