All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.ansj.app.extracting.Lexical Maven / Gradle / Ivy

The newest version!
package org.ansj.app.extracting;

import org.ansj.app.extracting.domain.Rule;
import org.ansj.app.extracting.domain.Token;
import org.ansj.app.extracting.exception.RuleFormatException;
import org.nlpcn.commons.lang.util.StringUtil;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Created by Ansj on 29/08/2017.
 */
public class Lexical {

	private static final Map MAP = new HashMap();

	static {
		MAP.put('(', ')');
		MAP.put('[', ']');
		MAP.put('{', '}');
	}


	public static Rule parse(String line) throws RuleFormatException {

		if (StringUtil.isBlank(line)) {
			return null;
		}

		List tokens = new ArrayList();

		Map groups = new HashMap();
		Map attr = new HashMap();

		double weight = 1d;

		String[] split = line.split("\t");

		String rule = split[0];

		List list = new ArrayList();

		for (int i = 0; i < rule.length(); i++) {
			switch (rule.charAt(i)) {
				case ' ':
					break;
				case '(':
				case '{':
				case '[':
					i = makeToken(list, rule, i);
					break;
				default:
					throw new RuleFormatException(rule + " not begin ( at index " + i);

			}

		}

		Token token = null;

		int index = 0;

		for (int i = 0; i < list.size(); i++) {
			String str = list.get(i);
			if (str.startsWith("(")) {
				Token tempToken = new Token(index, str);
				index++ ;
				tokens.add(tempToken);
				if (token != null) {
					token.setNext(tempToken);
				}
				token = tempToken;
			} else {
				token.addAttr(str);
			}
		}

		if (split.length > 1 && StringUtil.isNotBlank(split[1])) {
			String[] split1 = split[1].split(";");
			for (int i = 0; i < split1.length; i++) {
				String[] split2 = split1[i].split(":");
				if (split2[1].startsWith("(")) {
					attr.put(split2[0], split2[1].substring(1, split2[1].length() - 1));
				} else {
					String[] split3 = split2[1].split(",");
					int[] ints = new int[split3.length];
					for (int j = 0; j < ints.length; j++) {
						ints[j] = Integer.parseInt(split3[j].trim());
					}
					groups.put(split2[0], ints);
				}

			}
		}

		if (split.length > 2 && StringUtil.isNotBlank(split[2])) {
			weight = Double.parseDouble(split[2]);
		}

		return new Rule(line, tokens, groups, attr, weight);
	}

	private static int makeToken(List list, String rule, int i) {
		int num = 0;
		StringBuilder sb = new StringBuilder();
		char c;
		char begin = rule.charAt(i);
		char end = MAP.get(begin);
		int j = i;
		for (; j < rule.length(); j++) {
			c = rule.charAt(j);
			if (c == '\\') {
				j++;
				sb.append(rule.charAt(j));
				continue;
			}
			sb.append(c);
			if (c == begin) {
				num++;
			} else if (c == end) {
				num--;
			}

			if (num == 0) {
				break;
			}

		}
		list.add(sb.toString());
		return j;
	}


}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy