org.apache.lucene.analysis.synonym.SolrSynonymParser Maven / Gradle / Ivy
The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.synonym;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.LineNumberReader;
import java.io.Reader;
import java.text.ParseException;
import java.util.ArrayList;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;
/**
 * Parser for the Solr synonyms format.
 *
 * <ol>
 *   <li>Empty lines and lines starting with '#' are ignored (treated as comments).
 *   <li>Explicit mappings match any token sequence on the LHS of "=&gt;" and replace it with
 *       all alternatives on the RHS. These types of mappings ignore the expand parameter in
 *       the constructor. Example:
 *       <blockquote>
 *       i-pod, i pod =&gt; ipod
 *       </blockquote>
 *   <li>Equivalent synonyms may be separated with commas and give no explicit mapping. In this
 *       case the mapping behavior will be taken from the expand parameter in the constructor.
 *       This allows the same synonym file to be used in different synonym handling strategies.
 *       Example:
 *       <blockquote>
 *       ipod, i-pod, i pod
 *       </blockquote>
 *   <li>Multiple synonym mapping entries are merged. Example:
 *       <blockquote>
 *       foo =&gt; foo bar<br>
 *       foo =&gt; baz<br>
 *       <br>
 *       is equivalent to<br>
 *       <br>
 *       foo =&gt; foo bar, baz
 *       </blockquote>
 * </ol>
 *
 * @lucene.experimental
 */
public class SolrSynonymParser extends SynonymMap.Parser {

  /** Separates the left-hand side from the right-hand side of an explicit mapping. */
  private static final String MAPPING_SEPARATOR = "=>";

  /** Separates the individual terms on one side of a rule. */
  private static final String TERM_SEPARATOR = ",";

  /** Selects how a line without "=>" is turned into mappings; see {@link #addEquivalenceRule}. */
  private final boolean expand;

  /**
   * Creates a parser.
   *
   * @param dedup forwarded unchanged to the {@code SynonymMap.Parser} constructor
   * @param expand if {@code true}, a line of equivalent synonyms maps every term to every other
   *     term; if {@code false}, every term on the line is mapped to the first one
   * @param analyzer forwarded unchanged to the {@code SynonymMap.Parser} constructor
   */
  public SolrSynonymParser(boolean dedup, boolean expand, Analyzer analyzer) {
    super(dedup, analyzer);
    this.expand = expand;
  }

  /**
   * Reads synonym rules line by line from {@code in} and registers them. The reader is always
   * closed before this method returns.
   *
   * @param in source of the synonym rules
   * @throws IOException if reading fails
   * @throws ParseException if a rule is rejected with an {@link IllegalArgumentException}; the
   *     message names the line number and the original exception is attached as the cause
   */
  @Override
  public void parse(Reader in) throws IOException, ParseException {
    // try-with-resources keeps the primary failure and records a failing close() as suppressed,
    // instead of letting an exception from close() replace the real error.
    try (LineNumberReader br = new LineNumberReader(in)) {
      try {
        addInternal(br);
      } catch (IllegalArgumentException e) {
        ParseException ex =
            new ParseException("Invalid synonym rule at line " + br.getLineNumber(), 0);
        ex.initCause(e);
        throw ex;
      }
    }
  }

  /**
   * Processes every line of {@code in}, skipping empty lines and lines whose first character is
   * '#'.
   *
   * @throws IllegalArgumentException if a line contains more than one "=>" separated section pair
   */
  private void addInternal(BufferedReader in) throws IOException {
    String line;
    while ((line = in.readLine()) != null) {
      if (line.isEmpty() || line.charAt(0) == '#') {
        continue; // ignore empty lines and comments
      }

      // TODO: we could process this more efficiently.
      String[] sides = split(line, MAPPING_SEPARATOR);
      if (sides.length > 1) { // explicit mapping
        if (sides.length != 2) {
          throw new IllegalArgumentException(
              "more than one explicit mapping specified on the same line");
        }
        addExplicitRule(sides[0], sides[1]);
      } else {
        addEquivalenceRule(line);
      }
    }
  }

  /** Registers a mapping from every term in {@code lhs} to every term in {@code rhs}. */
  private void addExplicitRule(String lhs, String rhs) throws IOException {
    CharsRef[] inputs = analyzeTerms(split(lhs, TERM_SEPARATOR));
    CharsRef[] outputs = analyzeTerms(split(rhs, TERM_SEPARATOR));
    // these mappings are explicit and never preserve original
    for (CharsRef input : inputs) {
      for (CharsRef output : outputs) {
        add(input, output, false);
      }
    }
  }

  /**
   * Registers the mappings for a comma-separated line of equivalent synonyms, according to the
   * {@code expand} flag given to the constructor.
   */
  private void addEquivalenceRule(String line) throws IOException {
    CharsRef[] inputs = analyzeTerms(split(line, TERM_SEPARATOR));
    if (expand) {
      // all pairs
      for (int i = 0; i < inputs.length; i++) {
        for (int j = 0; j < inputs.length; j++) {
          if (i != j) {
            add(inputs[i], inputs[j], true);
          }
        }
      }
    } else {
      // all subsequent inputs map to first one; we also add inputs[0] here
      // so that we "effectively" (because we remove the original input and
      // add back a synonym with the same text) change that token's type to
      // SYNONYM (matching legacy behavior):
      for (CharsRef input : inputs) {
        add(input, inputs[0], false);
      }
    }
  }

  /** Unescapes, trims and analyzes each raw term, preserving order. */
  private CharsRef[] analyzeTerms(String[] terms) throws IOException {
    CharsRef[] analyzed = new CharsRef[terms.length];
    for (int i = 0; i < terms.length; i++) {
      analyzed[i] = analyze(unescape(terms[i]).trim(), new CharsRefBuilder());
    }
    return analyzed;
  }

  /**
   * Splits {@code s} on every unescaped occurrence of {@code separator}.
   *
   * <p>A backslash protects the character that follows it from being treated as the start of a
   * separator; both the backslash and that character are kept in the output so that {@link
   * #unescape} can resolve them later. Empty segments are dropped.
   *
   * @return the non-empty segments, in order of appearance
   */
  private static String[] split(String s, String separator) {
    ArrayList<String> list = new ArrayList<>(2);
    StringBuilder sb = new StringBuilder();
    int pos = 0;
    int end = s.length();
    while (pos < end) {
      if (s.startsWith(separator, pos)) {
        if (sb.length() > 0) {
          list.add(sb.toString());
          sb.setLength(0);
        }
        pos += separator.length();
        continue;
      }

      char ch = s.charAt(pos++);
      if (ch == '\\') {
        sb.append(ch);
        if (pos >= end) {
          break; // trailing backslash: keep it and stop (ERROR, or let it go?)
        }
        ch = s.charAt(pos++);
      }
      sb.append(ch);
    }

    if (sb.length() > 0) {
      list.add(sb.toString());
    }
    return list.toArray(new String[list.size()]);
  }

  /**
   * Removes backslash escapes: each backslash that is followed by another character is dropped
   * and that character is kept literally. A backslash in the last position is kept as-is.
   */
  private static String unescape(String s) {
    if (s.indexOf('\\') < 0) {
      return s; // nothing to unescape
    }
    StringBuilder sb = new StringBuilder(s.length());
    for (int i = 0; i < s.length(); i++) {
      char ch = s.charAt(i);
      if (ch == '\\' && i < s.length() - 1) {
        sb.append(s.charAt(++i));
      } else {
        sb.append(ch);
      }
    }
    return sb.toString();
  }
}