{
"warc" : {
"title" : "Default indexer config."
# Indexing configuration:
"index" : {
# What to extract:
"extract" : {
# Maximum payload size allowed to be kept wholly in RAM:
"inMemoryThreshold" : 10M,
# Maximum payload size that will be serialised out to disk instead of held in RAM:
"onDiskThreshold" : 100M,
# Content to extract
"content" : {
# Should we index the content body text?
"text" : true,
# Should we store the content body text?
"text_stored" : true,
# Extract UK Postcodes and geoindex?
"text_extract_postcodes": true,
# Run simple AFINN sentiment analysis?
"test_sentimentj": false,
# Run the Stanford NER?
"text_stanford_ner": false,
# Should we extract the fuzzy hash of the text?
"text_fuzzy_hash" : true,
# Extract list of elements used in HTML:
"elements_used" : true,
# Extract potential PDF problems using Apache PDFBox Preflight:
"extractApachePreflightErrors" : true,
# Extract image features:
"images" : {
"enabled" : true,
"maxSizeInBytes" : 1M,
"detectFaces" : true,
"dominantColours" : true,
# The random sampling rate:
# (where '1' means 'extract from all images',
# and '100' would mean 'extract from 1 out of every 100 images')
"analysisSamplingRate": 10
}
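# As a sketch of how sampling might be tuned at runtime (assuming the standard
# Typesafe Config behaviour of letting Java system properties override this file):
# java -Dwarc.index.extract.content.images.analysisSamplingRate=100 ...
# would analyse roughly 1 in every 100 images.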
# Language profiles to load for langdetect
"language" : {
"enabled" : true,
"langdetectprofiles" : [ "af", "ar", "bg", "bn", "cs", "da", "de", "el", "en", "es", "et", "fa", "fi", "fr", "gu", "he", "hi", "hr", "hu", "id", "it", "ja", "kn", "ko", "lt", "lv", "mk", "ml", "mr", "ne", "nl", "no", "pa", "pl", "pt", "ro", "ru", "sk", "sl", "so", "sq", "sv", "sw", "ta", "te", "th", "tl", "tr", "uk", "ur", "vi", "zh-cn", "zh-tw" ]
}
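# The codes above are the langdetect language-profile names. To restrict
# detection to a few expected languages, shrink the list, e.g. (illustrative):
# "langdetectprofiles" : [ "en", "de", "fr" ]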
# Extract the first bytes of the file (for shingling):
"first_bytes" : {
# Enabled?
"enabled" : false,
# Number of bytes to extract (>=4 to allow content_ffb to work):
"num_bytes" : 32
}
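# When enabled, the leading bytes feed the first-bytes/shingling fields such as
# content_ffb ("first four bytes"); e.g. a GIF payload begins with the bytes
# 47 49 46 38 ("GIF8"), assuming the hex rendering used by the indexer.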
# Annotations file
"annotations" : {
"enabled" : false,
"file" : "/path/to/annotations.json",
"surt_prefix_file" : "/path/to/openAccessSurts.txt"
}
},
# Which linked entities to extract:
"linked" : {
"normalise" : false,
"resources" : false,
"hosts" : true,
"domains" : true,
"images" : true
},
# Restrict record types:
"record_type_include" : [
"response", "revisit"
],
# Restrict response codes:
# works by matches starting with the characters, so "2" will match 2xx:
"response_include" : [
"2"
],
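# For example, to index both 2xx and 3xx responses (illustrative):
# "response_include" : [ "2", "3" ]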
# Restrict protocols:
"protocol_include" : [
"http",
"https"
],
# URLs to skip, e.g. [ "robots.txt" ] (empty by default):
"url_exclude" : [],
},
# Parameters relating to format identification:
"id" : {
# Allow tools to infer format from the resource URI (file extension):
"useResourceURI" : true
# DROID-specific config:
"droid" : {
"enabled" : true,
"useBinarySignaturesOnly" : false
},
},
# Parameters to control Apache Tika behaviour
"tika" : {
# Maximum length of text to extract:
"max_text_length" : 512K,
# Should we use the 'boilerpipe' text extractor?:
"use_boilerpipe": false,
# The parse timeout in milliseconds, for when Tika gets stuck (300000 ms = 5 minutes):
"parse_timeout" : 300000,
# Formats to avoid processing
"exclude_mime" : [
"x-tar",
"x-gzip",
"bz",
"lz",
"compress",
"zip",
"javascript",
"css",
"octet-stream"
],
# Should we extract all the available metadata:
"extract_all_metadata": false
},
# Parameters to control the exclusion of results from the indexing process:
"exclusions" : {
# Exclusion enabled?
"enabled" : false,
# Default check interval before reloading the exclusions file, in seconds:
"check_interval" : 600,
# Exclusion URI/SURT prefix file:
"file" : "/path/to/exclude.txt"
}
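# Each line of the exclusions file is expected to be a URI or SURT prefix;
# a SURT-form line would look like (illustrative): http://(uk,co,example,)/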
},
# Solr configuration:
"solr" : {
# Use the hash+url as the ID for the documents
"use_hash_url_id": false
# Check SOLR for duplicates during indexing:
"check_solr_for_duplicates": false
# Server configuration:
"server" : "http://localhost:8080/solr/discovery",
# Solr document batch size for submissions:
"batch_size" : 50,
# Number of threads per Solr client:
"num_threads" : 1,
# Number of shards
"num_shards": 1,
# Is this a dummy-run? (i.e. should we NOT post to SOLR?)
"dummy_run" : false,
# Disable explicit commit
"disablecommit" : false,
# field-specific setup
"field_setup" : {
"default_max_length" : -1,
"fields" : {
"url" : { "max_length" : 2048 }, # de facto max length for GET URLs
"url_norm" : { "max_length" : 2048 },
"links" : { "max_length" : 2048 },
"content" : { "max_length" : 512K }, # Same as tika.max_text_length
},
},
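# Fields longer than their max_length are presumably truncated before
# submission, with -1 (the default above) meaning "no limit" (an assumption
# based on the defaults here, not a documented guarantee).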
},
# HTTP Proxy to use when talking to Solr (if any):
"http_proxy" : {
#"host" : explorer.bl.uk,
#"port" : 3127
}
}
}
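# This file is the bundled reference configuration. Rather than editing it,
# deployments normally override individual keys, e.g. via an application.conf
# on the classpath or -Dconfig.file=/path/to/local.conf (standard Typesafe
# Config behaviour; the exact option the indexer CLI exposes may differ):
#
# warc.solr.server = "http://solr.example.org:8983/solr/discovery"
# warc.index.extract.content.text_stanford_ner = true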