ka.tika-eval.1.26.source-code.lucene-analyzers.json Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of tika-eval Show documentation

There is a newer version: 3.0.0

{
  "analyzers": {
    "general": {
      "charfilters": [
        {
          "factory": "mapping",
          "params": {
            "mapping": "/lucene-char-mapping.txt"
          }
        }
      ],
      "tokenizer": {
        "factory": "uax29urlemail",
        "params": {}
      },
      "tokenfilters": [
        {
          "factory": "icufolding",
          "params": {}
        },
        {
          "factory": "cjkbigram",
          "params": {
            "outputUnigrams": "false"
          }
        }
      ]
    },
    "common_tokens": {
      "_comment": "Use this analyzer for counting common tokens in a corpus. This isn't used by tika-eval during profiling or comparing.",
      "tokenizer": {
        "factory": "uax29urlemail",
        "params": {}
      },
      "tokenfilters": [
        {
          "factory": "urlemailnormalizing",
          "params": {}
        },
        {
          "factory": "alphaideograph",
          "params": {}
        },
        {
          "factory": "icufolding",
          "params": {}
        },
        {
          "factory": "cjkbigram",
          "params": {
            "outputUnigrams": "false"
          }
        },
        {
          "factory": "cjkbigramawarelength",
          "params": {
            "min": 3,
            "max": 20
          }
        }
      ]
    }
  }
}