All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.lucene.analysis.synonym.SolrSynonymParser Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.synonym;


import java.io.BufferedReader;
import java.io.IOException;
import java.io.LineNumberReader;
import java.io.Reader;
import java.text.ParseException;
import java.util.ArrayList;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;

/**
 * Parser for the Solr synonyms format.
 * 
    *
  1. Blank lines and lines starting with '#' are comments. *
  2. Explicit mappings match any token sequence on the LHS of "=>" * and replace with all alternatives on the RHS. These types of mappings * ignore the expand parameter in the constructor. * Example: *
    i-pod, i pod => ipod
    *
  3. Equivalent synonyms may be separated with commas and give * no explicit mapping. In this case the mapping behavior will * be taken from the expand parameter in the constructor. This allows * the same synonym file to be used in different synonym handling strategies. * Example: *
    ipod, i-pod, i pod
    * *
  4. Multiple synonym mapping entries are merged. * Example: *
    * foo => foo bar
    * foo => baz

    * is equivalent to

    * foo => foo bar, baz *
    *
* @lucene.experimental */ public class SolrSynonymParser extends SynonymMap.Parser { private final boolean expand; public SolrSynonymParser(boolean dedup, boolean expand, Analyzer analyzer) { super(dedup, analyzer); this.expand = expand; } @Override public void parse(Reader in) throws IOException, ParseException { LineNumberReader br = new LineNumberReader(in); try { addInternal(br); } catch (IllegalArgumentException e) { ParseException ex = new ParseException("Invalid synonym rule at line " + br.getLineNumber(), 0); ex.initCause(e); throw ex; } finally { br.close(); } } private void addInternal(BufferedReader in) throws IOException { String line = null; while ((line = in.readLine()) != null) { if (line.length() == 0 || line.charAt(0) == '#') { continue; // ignore empty lines and comments } // TODO: we could process this more efficiently. String sides[] = split(line, "=>"); if (sides.length > 1) { // explicit mapping if (sides.length != 2) { throw new IllegalArgumentException("more than one explicit mapping specified on the same line"); } String inputStrings[] = split(sides[0], ","); CharsRef[] inputs = new CharsRef[inputStrings.length]; for (int i = 0; i < inputs.length; i++) { inputs[i] = analyze(unescape(inputStrings[i]).trim(), new CharsRefBuilder()); } String outputStrings[] = split(sides[1], ","); CharsRef[] outputs = new CharsRef[outputStrings.length]; for (int i = 0; i < outputs.length; i++) { outputs[i] = analyze(unescape(outputStrings[i]).trim(), new CharsRefBuilder()); } // these mappings are explicit and never preserve original for (int i = 0; i < inputs.length; i++) { for (int j = 0; j < outputs.length; j++) { add(inputs[i], outputs[j], false); } } } else { String inputStrings[] = split(line, ","); CharsRef[] inputs = new CharsRef[inputStrings.length]; for (int i = 0; i < inputs.length; i++) { inputs[i] = analyze(unescape(inputStrings[i]).trim(), new CharsRefBuilder()); } if (expand) { // all pairs for (int i = 0; i < inputs.length; i++) { for (int j = 
0; j < inputs.length; j++) { if (i != j) { add(inputs[i], inputs[j], true); } } } } else { // all subsequent inputs map to first one; we also add inputs[0] here // so that we "effectively" (because we remove the original input and // add back a synonym with the same text) change that token's type to // SYNONYM (matching legacy behavior): for (int i = 0; i < inputs.length; i++) { add(inputs[i], inputs[0], false); } } } } } private static String[] split(String s, String separator) { ArrayList list = new ArrayList<>(2); StringBuilder sb = new StringBuilder(); int pos=0, end=s.length(); while (pos < end) { if (s.startsWith(separator,pos)) { if (sb.length() > 0) { list.add(sb.toString()); sb=new StringBuilder(); } pos+=separator.length(); continue; } char ch = s.charAt(pos++); if (ch=='\\') { sb.append(ch); if (pos>=end) break; // ERROR, or let it go? ch = s.charAt(pos++); } sb.append(ch); } if (sb.length() > 0) { list.add(sb.toString()); } return list.toArray(new String[list.size()]); } private String unescape(String s) { if (s.indexOf("\\") >= 0) { StringBuilder sb = new StringBuilder(); for (int i = 0; i < s.length(); i++) { char ch = s.charAt(i); if (ch == '\\' && i < s.length() - 1) { sb.append(s.charAt(++i)); } else { sb.append(ch); } } return sb.toString(); } return s; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy