
reference.conf (warc-indexer 3.0.0)

{
    "warc" : {
        "title" : "Default indexer config."
        # Indexing configuration:
        "index" : {
            # What to extract:
            "extract" : {
                # Maximum payload size allowed to be kept wholly in RAM:
                "inMemoryThreshold" : 10M,
                # Maximum payload size that will be serialised out to disk instead of held in RAM:
                "onDiskThreshold" : 100M,
                
                # Content to extract
                "content" : {
                    # Should we index the content body text?
                    "text" : true,
                    
                    # Should we store the content body text?
                    "text_stored" : true,
                    
                    # Extract UK Postcodes and geoindex?
                    "text_extract_postcodes": true,

                    # Run simple AFINN sentiment analysis?
                    "text_sentimentj": false,
                     
                    # Run the Stanford NER?
                    "text_stanford_ner": false,
                    
                    # Should we extract the fuzzy hash of the text?
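                    # (This is a similarity digest in the style of ssdeep, so
                    # near-duplicate texts produce comparable hashes.)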
                    "text_fuzzy_hash" : true,
                    
                    # Extract list of elements used in HTML:
                    "elements_used" : true,
                    
                    # Extract potential PDF problems using Apache PDFBox Preflight:
                    "extractApachePreflightErrors" : true,
                    
                    # Extract image features:
                    "images" : {
                        "enabled" : true,
                        "maxSizeInBytes" : 1M,
                    	"detectFaces" : true,
                    	"dominantColours" : true,
                        # The random sampling rate:
                        # (where '1' means 'extract from all images', 
                        # and '100' would mean 'extract from 1 out of every 100 images')
                        "analysisSamplingRate": 10
                    }
                    
                    # Language profiles to load for langdetect
                    "language" : {
                        "enabled" : true,
                        "langdetectprofiles" : [ "af", "ar", "bg", "bn", "cs", "da", "de", "el", "en", "es", "et", "fa", "fi", "fr", "gu", "he", "hi", "hr", "hu", "id", "it", "ja", "kn", "ko", "lt", "lv", "mk", "ml", "mr", "ne", "nl", "no", "pa", "pl", "pt", "ro", "ru", "sk", "sl", "so", "sq", "sv", "sw", "ta", "te", "th", "tl", "tr", "uk", "ur", "vi", "zh-cn", "zh-tw" ]
                    }

                    # Extract the first bytes of the file (for shingling):
                    "first_bytes" : {
                        # Enabled?
                        "enabled" : false,
                        # Number of bytes to extract (>=4 to allow content_ffb to work):
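                        # (content_ffb is understood to be the 'first four bytes'
                        # field, hence the minimum of 4.)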
                        "num_bytes" : 32
                    }

                    # Annotations file
                    "annotations" : {
                        "enabled" : false,
                        "file" : "/path/to/annotations.json",
                        "surt_prefix_file" : "/path/to/openAccessSurts.txt"
                    }

                },
                
                # Which linked entities to extract:
                "linked" : {
                    "normalise" : false,
                    "resources" : false,
                    "hosts" : true,
                    "domains" : true,
                    "images" : true
                },
                
                # Restrict record types:
                "record_type_include" : [
                    response, revisit
                ],
                
                # Restrict response codes:
                # (matched by prefix, so "2" will match any 2xx status):
                "response_include" : [
                    "2"
                ],
    
                # Restrict protocols:
                "protocol_include" : [
                    http,
                    https
                ],
   
                # URLs to skip, e.g. ["robots.txt"]:
                "url_exclude" : [],
            },
            
            # Parameters relating to format identification:   
            "id" : {
                # Allow tools to infer format from the resource URI (file extension):
                "useResourceURI" : true
    
                # DROID-specific config:
                "droid" : {
                    "enabled" : true,
                    "useBinarySignaturesOnly" : false
                },
            },
            
            # Parameters to control Apache Tika behaviour
            "tika" : {
                # Maximum length of text to extract:
                "max_text_length" : 512K,
                # Should we use the 'boilerpipe' text extractor?
                "use_boilerpipe": false,
                # The parse timeout, in milliseconds (for when Tika gets stuck):
                "parse_timeout" : 300000,
                # Formats to avoid processing
                "exclude_mime" : [
                    "x-tar",
                    "x-gzip",
                    "bz",
                    "lz",
                    "compress",
                    "zip",
                    "javascript",
                    "css",
                    "octet-stream"
                ],
                # Should we extract all the available metadata:
                "extract_all_metadata": false
            },
            
            # Parameters to control the exclusion of results from the indexing process:
            "exclusions" : {
                # Exclusion enabled?
                "enabled" : false,
                # Default check interval before reloading the exclusions file, in seconds:
                "check_interval" : 600,
                # Exclusion URI/SURT prefix file:
                "file" : "/path/to/exclude.txt"
            }
        },
        
        # Solr configuration:
        "solr" : {
            # Use the hash+url as the ID for the documents
            "use_hash_url_id": false
            # Check SOLR for duplicates during indexing:
            "check_solr_for_duplicates": false
            # Server configuration:
            "server" : "http://localhost:8080/solr/discovery",
            # Solr document batch size for submissions:
            "batch_size" : 50,
            # Number of threads per Solr client:
            "num_threads" : 1,
            # Number of shards
            "num_shards": 1,
            # Is this a dummy-run? (i.e. should we NOT post to SOLR?)
            "dummy_run" : false,
            # Disable explicit commit
            "disablecommit" : false,

            # field-specific setup
            "field_setup" : {
                "default_max_length" : -1,
                "fields" : {
                    "url" :      { "max_length" : 2048 }, # de facto max length for GET URLs
                    "url_norm" : { "max_length" : 2048 },
                    "links" :    { "max_length" : 2048 },
                    "content" :  { "max_length" : 512K }, # Same as tika.max_text_length
                },
            },
        },
        
        # HTTP Proxy to use when talking to Solr (if any):
        "http_proxy" : {
            #"host" : explorer.bl.uk,
            #"port" : 3127
        }
    }
}
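
Since this file follows the Typesafe Config reference.conf naming convention, a
deployment can override individual keys from its own application.conf (or a file
supplied via -Dconfig.file=...) instead of editing the defaults here. A minimal
sketch of such an override, assuming the standard HOCON merge behaviour; the Solr
URL and values below are illustrative only:

    warc {
        # Turn off language detection for this deployment:
        index.extract.content.language.enabled = false
        solr {
            server = "http://solr.example.org:8983/solr/discovery"
            batch_size = 200
        }
    }

Any key not overridden keeps the default defined above.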
    



