All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.carrot2.language.LexicalDataImpl Maven / Gradle / Ivy

There is a newer version: 4.6.0
Show newest version
/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2020, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * https://www.carrot2.org/carrot2.LICENSE
 */
package org.carrot2.language;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.regex.Pattern;
import org.carrot2.util.ResourceLookup;

/**
 * {@link LexicalData} implemented on top of a hash set (stopwords) and a regular expression pattern
 * (stoplabels).
 */
public final class LexicalDataImpl implements LexicalData {
  private final HashSet stopwords;
  private final Pattern stoplabelPattern;

  public LexicalDataImpl(HashSet stopwords, Pattern stoplabelPattern) {
    this.stopwords = stopwords;
    this.stoplabelPattern = stoplabelPattern;
  }

  public LexicalDataImpl(ResourceLookup loader, String stopwordsResource, String stoplabelsResource)
      throws IOException {
    this(loadStopwords(loader, stopwordsResource), loadStoplabels(loader, stoplabelsResource));
  }

  /*
   *
   */
  @Override
  public boolean ignoreWord(CharSequence word) {
    return stopwords.contains(word.toString());
  }

  /*
   *
   */
  @Override
  public boolean ignoreLabel(CharSequence label) {
    if (this.stoplabelPattern == null) return false;

    return stoplabelPattern.matcher(label).matches();
  }

  private static Pattern loadStoplabels(ResourceLookup loader, String stoplabelsResource)
      throws IOException {
    List stoplabels;
    try (InputStream is = loader.open(stoplabelsResource);
        BufferedReader reader =
            new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))) {
      stoplabels = compile(readLines(reader));
    }
    return union(stoplabels);
  }

  private static HashSet loadStopwords(ResourceLookup loader, String stopwordsResource)
      throws IOException {
    HashSet stopwords = new HashSet<>();
    try (InputStream is = loader.open(stopwordsResource);
        BufferedReader reader =
            new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))) {
      readLines(reader).forEach(word -> stopwords.add(word.toLowerCase(Locale.ROOT)));
    }
    return stopwords;
  }

  /**
   * Loads words from a given resource (UTF-8, one word per line, #-starting lines are considered
   * comments).
   */
  private static HashSet readLines(BufferedReader reader) throws IOException {
    final HashSet words = new HashSet<>();
    String line;
    while ((line = reader.readLine()) != null) {
      line = line.trim();
      if (!line.startsWith("#") && !line.isEmpty()) {
        words.add(line);
      }
    }
    return words;
  }

  private static List compile(Set patterns) {
    ArrayList compiled = new ArrayList<>();
    for (String p : patterns) {
      compiled.add(Pattern.compile(p));
    }
    return compiled;
  }

  /**
   * Combines a number of patterns into a single pattern with a union of all of them. With
   * automata-based pattern engines, this should be faster and memory-friendly.
   */
  private static Pattern union(List patterns) {
    final StringBuilder union = new StringBuilder();
    if (patterns.size() > 0) {
      union.append("(");
      for (int i = 0; i < patterns.size(); i++) {
        if (i > 0) union.append(")|(");
        union.append(patterns.get(i).toString());
      }
      union.append(")");
      return Pattern.compile(union.toString());
    } else {
      return null;
    }
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy